import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import probplot
from scipy.stats import shapiro
from imblearn.over_sampling import RandomOverSampler
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import KFold
plt.rcParams['figure.figsize'] = [10, 10]
hr= pd.read_csv("hr train data.csv")
hr.head()
| sno | enrollee_id | city | city_development_index | gender | relevent_experience | enrolled_university | education_level | major_discipline | experience | company_size | company_type | last_new_job | training_hours | job_change | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 8949 | city_103 | 0.920 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 20.0 | NaN | NaN | 1 | 36 | YES |
| 1 | 1 | 29725 | city_40 | 0.776 | Male | No relevent experience | no_enrollment | Graduate | STEM | 15.0 | 50-99 | Pvt Ltd | >4 | 47 | NO |
| 2 | 2 | 11561 | city_21 | 0.624 | Male | No relevent experience | Full time course | Graduate | STEM | 5.0 | NaN | NaN | never | 83 | NO |
| 3 | 3 | 33241 | city_115 | 0.789 | Male | No relevent experience | NaN | Graduate | Business Degree | 0.0 | NaN | Pvt Ltd | never | 52 | YES |
| 4 | 4 | 666 | city_162 | 0.767 | Male | Has relevent experience | no_enrollment | Masters | STEM | 20.0 | 50-99 | Funded Startup | 4 | 8 | NO |
hr.columns
Index(['sno', 'enrollee_id', 'city', 'city_development_index', 'gender',
'relevent_experience', 'enrolled_university', 'education_level',
'major_discipline', 'experience', 'company_size', 'company_type',
'last_new_job', 'training_hours', 'job_change'],
dtype='object')
hr.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 19158 entries, 0 to 19157 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 sno 19158 non-null int64 1 enrollee_id 19158 non-null int64 2 city 19158 non-null object 3 city_development_index 19158 non-null float64 4 gender 19158 non-null object 5 relevent_experience 19158 non-null object 6 enrolled_university 18772 non-null object 7 education_level 19158 non-null object 8 major_discipline 19158 non-null object 9 experience 19093 non-null float64 10 company_size 13220 non-null object 11 company_type 13018 non-null object 12 last_new_job 18735 non-null object 13 training_hours 19158 non-null int64 14 job_change 19158 non-null object dtypes: float64(2), int64(3), object(10) memory usage: 2.2+ MB
(hr.isnull().sum()/len(hr))*100
sno 0.000000 enrollee_id 0.000000 city 0.000000 city_development_index 0.000000 gender 0.000000 relevent_experience 0.000000 enrolled_university 2.014824 education_level 0.000000 major_discipline 0.000000 experience 0.339284 company_size 30.994885 company_type 32.049274 last_new_job 2.207955 training_hours 0.000000 job_change 0.000000 dtype: float64
hr.drop(["sno","enrollee_id"], axis=1, inplace=True)
hr.select_dtypes(include="object").columns
Index(['city', 'gender', 'relevent_experience', 'enrolled_university',
'education_level', 'major_discipline', 'company_size', 'company_type',
'last_new_job', 'job_change'],
dtype='object')
hr.select_dtypes(exclude="object").columns
Index(['city_development_index', 'experience', 'training_hours'], dtype='object')
hr.head()
| city | city_development_index | gender | relevent_experience | enrolled_university | education_level | major_discipline | experience | company_size | company_type | last_new_job | training_hours | job_change | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | city_103 | 0.920 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 20.0 | NaN | NaN | 1 | 36 | YES |
| 1 | city_40 | 0.776 | Male | No relevent experience | no_enrollment | Graduate | STEM | 15.0 | 50-99 | Pvt Ltd | >4 | 47 | NO |
| 2 | city_21 | 0.624 | Male | No relevent experience | Full time course | Graduate | STEM | 5.0 | NaN | NaN | never | 83 | NO |
| 3 | city_115 | 0.789 | Male | No relevent experience | NaN | Graduate | Business Degree | 0.0 | NaN | Pvt Ltd | never | 52 | YES |
| 4 | city_162 | 0.767 | Male | Has relevent experience | no_enrollment | Masters | STEM | 20.0 | 50-99 | Funded Startup | 4 | 8 | NO |
hr.nunique()# unique values count
city 123 city_development_index 93 gender 3 relevent_experience 2 enrolled_university 3 education_level 5 major_discipline 6 experience 21 company_size 8 company_type 6 last_new_job 6 training_hours 241 job_change 2 dtype: int64
hr.enrolled_university.unique()
array(['no_enrollment', 'Full time course', nan, 'Part time course'],
dtype=object)
pd.set_option("display.max_rows", None)
hr.enrolled_university.value_counts()
no_enrollment 13817 Full time course 3757 Part time course 1198 Name: enrolled_university, dtype: int64
hr.enrolled_university.fillna("no_enrollment",inplace=True)
(hr.isnull().sum()/len(hr))*100
city 0.000000 city_development_index 0.000000 gender 0.000000 relevent_experience 0.000000 enrolled_university 0.000000 education_level 0.000000 major_discipline 0.000000 experience 0.339284 company_size 30.994885 company_type 32.049274 last_new_job 2.207955 training_hours 0.000000 job_change 0.000000 dtype: float64
hr.last_new_job.unique()
array(['1', '>4', 'never', '4', '3', '2', nan], dtype=object)
hr.last_new_job.value_counts()
1 8040 >4 3290 2 2900 never 2452 4 1029 3 1024 Name: last_new_job, dtype: int64
hr[(hr.relevent_experience=="No relevent experience") & (hr.last_new_job=="never")].count()
city 1751 city_development_index 1751 gender 1751 relevent_experience 1751 enrolled_university 1751 education_level 1751 major_discipline 1751 experience 1744 company_size 170 company_type 377 last_new_job 1751 training_hours 1751 job_change 1751 dtype: int64
hr[(hr.relevent_experience=="Has relevent experience")]["last_new_job"].value_counts()
1 6018 >4 2765 2 2370 4 876 3 839 never 701 Name: last_new_job, dtype: int64
hr[(hr.relevent_experience=="Has relevent experience") & (hr.last_new_job.isnull())]
| city | city_development_index | gender | relevent_experience | enrolled_university | education_level | major_discipline | experience | company_size | company_type | last_new_job | training_hours | job_change | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 58 | city_21 | 0.624 | Male | Has relevent experience | Full time course | Graduate | STEM | 2.0 | Oct-49 | Funded Startup | NaN | 32 | YES |
| 205 | city_11 | 0.550 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 5.0 | 50-99 | Pvt Ltd | NaN | 7 | YES |
| 391 | city_90 | 0.698 | Male | Has relevent experience | Full time course | Masters | No Major | NaN | NaN | NaN | NaN | 44 | YES |
| 575 | city_45 | 0.890 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 9.0 | 50-99 | Pvt Ltd | NaN | 62 | NO |
| 719 | city_103 | 0.920 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 11.0 | NaN | NaN | NaN | 11 | YES |
| 770 | city_16 | 0.910 | Male | Has relevent experience | Full time course | Masters | STEM | 5.0 | 50-99 | NaN | NaN | 92 | NO |
| 891 | city_146 | 0.735 | Male | Has relevent experience | Full time course | Graduate | STEM | 3.0 | 100-500 | NaN | NaN | 18 | NO |
| 909 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 9.0 | 5000-9999 | Pvt Ltd | NaN | 14 | NO |
| 994 | city_21 | 0.624 | Male | Has relevent experience | Full time course | Graduate | STEM | 2.0 | 100-500 | NaN | NaN | 37 | YES |
| 1187 | city_74 | 0.579 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 14.0 | Oct-49 | NaN | NaN | 57 | NO |
| 1323 | city_16 | 0.910 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 20.0 | Oct-49 | NaN | NaN | 42 | NO |
| 1372 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Graduate | Other | 1.0 | <10 | NaN | NaN | 24 | YES |
| 1376 | city_103 | 0.920 | Male | Has relevent experience | no_enrollment | High School | No Major | 6.0 | NaN | NaN | NaN | 156 | NO |
| 1388 | city_91 | 0.691 | Male | Has relevent experience | Full time course | Graduate | STEM | 4.0 | NaN | NGO | NaN | 34 | YES |
| 1441 | city_23 | 0.899 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 20.0 | 5000-9999 | Pvt Ltd | NaN | 39 | NO |
| 1445 | city_11 | 0.550 | Male | Has relevent experience | no_enrollment | High School | No Major | 3.0 | NaN | NaN | NaN | 28 | YES |
| 1492 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 0.0 | 10000+ | Pvt Ltd | NaN | 28 | YES |
| 1585 | city_123 | 0.738 | Male | Has relevent experience | Full time course | Graduate | STEM | 13.0 | 100-500 | NaN | NaN | 73 | NO |
| 1734 | city_74 | 0.579 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 2.0 | NaN | NaN | NaN | 62 | YES |
| 1742 | city_73 | 0.754 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 7.0 | 500-999 | Pvt Ltd | NaN | 94 | NO |
| 1804 | city_123 | 0.738 | Male | Has relevent experience | Full time course | Masters | STEM | 8.0 | 10000+ | Pvt Ltd | NaN | 34 | NO |
| 1889 | city_21 | 0.624 | Male | Has relevent experience | Full time course | Masters | STEM | 12.0 | NaN | Public Sector | NaN | 35 | NO |
| 1964 | city_136 | 0.897 | Male | Has relevent experience | no_enrollment | Masters | STEM | 2.0 | <10 | Early Stage Startup | NaN | 54 | NO |
| 2007 | city_102 | 0.804 | Male | Has relevent experience | no_enrollment | Graduate | Other | 17.0 | NaN | NaN | NaN | 73 | NO |
| 2060 | city_100 | 0.887 | Male | Has relevent experience | Part time course | Graduate | STEM | 7.0 | NaN | NaN | NaN | 33 | NO |
| 2118 | city_160 | 0.920 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 14.0 | <10 | Pvt Ltd | NaN | 90 | NO |
| 2191 | city_103 | 0.920 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 20.0 | 100-500 | NaN | NaN | 101 | NO |
| 2200 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 0.0 | 10000+ | Pvt Ltd | NaN | 38 | YES |
| 2355 | city_136 | 0.897 | Male | Has relevent experience | no_enrollment | Phd | STEM | 16.0 | <10 | Funded Startup | NaN | 5 | NO |
| 2485 | city_165 | 0.903 | Male | Has relevent experience | no_enrollment | High School | No Major | 20.0 | Oct-49 | NaN | NaN | 50 | NO |
| 2552 | city_100 | 0.887 | Male | Has relevent experience | no_enrollment | Masters | STEM | 19.0 | 1000-4999 | Pvt Ltd | NaN | 104 | YES |
| 2727 | city_103 | 0.920 | Male | Has relevent experience | no_enrollment | Masters | STEM | NaN | NaN | NaN | NaN | 70 | NO |
| 2733 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 4.0 | 50-99 | NaN | NaN | 19 | YES |
| 2751 | city_65 | 0.802 | Male | Has relevent experience | Full time course | Graduate | STEM | 6.0 | NaN | NaN | NaN | 100 | NO |
| 2781 | city_116 | 0.743 | Male | Has relevent experience | Full time course | Graduate | STEM | 2.0 | 5000-9999 | Pvt Ltd | NaN | 8 | NO |
| 2844 | city_21 | 0.624 | Male | Has relevent experience | Part time course | Phd | Other | 4.0 | 50-99 | Pvt Ltd | NaN | 8 | YES |
| 2893 | city_173 | 0.878 | Male | Has relevent experience | no_enrollment | Masters | STEM | 20.0 | NaN | NaN | NaN | 45 | NO |
| 2961 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Masters | STEM | 5.0 | <10 | Early Stage Startup | NaN | 107 | YES |
| 3086 | city_11 | 0.550 | Male | Has relevent experience | Full time course | Primary School | No Major | NaN | <10 | Early Stage Startup | NaN | 62 | YES |
| 3128 | city_16 | 0.910 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 9.0 | 50-99 | Pvt Ltd | NaN | 17 | NO |
| 3216 | city_21 | 0.624 | Male | Has relevent experience | Full time course | Masters | STEM | 2.0 | 50-99 | Pvt Ltd | NaN | 48 | YES |
| 3355 | city_103 | 0.920 | Male | Has relevent experience | no_enrollment | Graduate | No Major | 18.0 | NaN | NaN | NaN | 10 | NO |
| 3456 | city_103 | 0.920 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 20.0 | 500-999 | NGO | NaN | 77 | NO |
| 3564 | city_103 | 0.920 | Male | Has relevent experience | no_enrollment | Masters | STEM | NaN | NaN | NaN | NaN | 34 | NO |
| 3719 | city_105 | 0.794 | Male | Has relevent experience | no_enrollment | Masters | STEM | 18.0 | 50-99 | Pvt Ltd | NaN | 21 | NO |
| 3805 | city_79 | 0.698 | Male | Has relevent experience | Full time course | Masters | STEM | 2.0 | 50-99 | Pvt Ltd | NaN | 95 | NO |
| 4022 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 10.0 | 50-99 | Pvt Ltd | NaN | 28 | NO |
| 4166 | city_103 | 0.920 | Male | Has relevent experience | no_enrollment | Graduate | Other | 16.0 | Oct-49 | Pvt Ltd | NaN | 64 | NO |
| 4415 | city_105 | 0.794 | Male | Has relevent experience | no_enrollment | Masters | STEM | 20.0 | NaN | NaN | NaN | 4 | NO |
| 4487 | city_136 | 0.897 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 16.0 | NaN | NaN | NaN | 152 | NO |
| 4663 | city_21 | 0.624 | Male | Has relevent experience | Full time course | Graduate | STEM | 1.0 | 1000-4999 | Pvt Ltd | NaN | 131 | NO |
| 4671 | city_71 | 0.884 | Male | Has relevent experience | Full time course | Graduate | STEM | 8.0 | NaN | NaN | NaN | 39 | NO |
| 4733 | city_90 | 0.698 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 2.0 | <10 | Pvt Ltd | NaN | 134 | YES |
| 4739 | city_28 | 0.939 | Male | Has relevent experience | no_enrollment | High School | No Major | 6.0 | 50-99 | Pvt Ltd | NaN | 2 | NO |
| 4851 | city_100 | 0.887 | Male | Has relevent experience | Full time course | High School | No Major | 13.0 | 500-999 | NaN | NaN | 49 | NO |
| 4928 | city_21 | 0.624 | Male | Has relevent experience | Full time course | Graduate | STEM | 1.0 | <10 | Early Stage Startup | NaN | 85 | YES |
| 5053 | city_114 | 0.926 | Male | Has relevent experience | no_enrollment | High School | No Major | 9.0 | 5000-9999 | NaN | NaN | 64 | NO |
| 5070 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 3.0 | NaN | Pvt Ltd | NaN | 56 | YES |
| 5102 | city_28 | 0.939 | Male | Has relevent experience | Full time course | Graduate | STEM | 4.0 | 50-99 | Pvt Ltd | NaN | 105 | NO |
| 5106 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 1.0 | 50-99 | Pvt Ltd | NaN | 21 | YES |
| 5188 | city_21 | 0.624 | Male | Has relevent experience | Full time course | Masters | STEM | 6.0 | 100-500 | Pvt Ltd | NaN | 172 | YES |
| 5190 | city_162 | 0.767 | Male | Has relevent experience | no_enrollment | Graduate | No Major | 16.0 | NaN | NaN | NaN | 43 | NO |
| 5204 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | High School | No Major | 3.0 | Oct-49 | Early Stage Startup | NaN | 118 | YES |
| 5252 | city_98 | 0.949 | Male | Has relevent experience | no_enrollment | Masters | STEM | 20.0 | 1000-4999 | Pvt Ltd | NaN | 62 | NO |
| 5356 | city_160 | 0.920 | Male | Has relevent experience | Full time course | Graduate | STEM | 8.0 | 100-500 | Pvt Ltd | NaN | 8 | NO |
| 5414 | city_11 | 0.550 | Male | Has relevent experience | Full time course | Graduate | STEM | 4.0 | Oct-49 | Pvt Ltd | NaN | 7 | YES |
| 5600 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 2.0 | Oct-49 | Public Sector | NaN | 36 | YES |
| 5670 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Graduate | STEM | NaN | NaN | Funded Startup | NaN | 27 | NO |
| 5715 | city_114 | 0.926 | Male | Has relevent experience | no_enrollment | Masters | STEM | 5.0 | 50-99 | Funded Startup | NaN | 12 | NO |
| 5729 | city_134 | 0.698 | Male | Has relevent experience | no_enrollment | Graduate | No Major | 7.0 | <10 | Pvt Ltd | NaN | 47 | NO |
| 5812 | city_114 | 0.926 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 6.0 | Oct-49 | NaN | NaN | 3 | NO |
| 5817 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 5.0 | 50-99 | Early Stage Startup | NaN | 32 | NO |
| 5886 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 0.0 | Oct-49 | NaN | NaN | 12 | YES |
| 5951 | city_136 | 0.897 | Male | Has relevent experience | no_enrollment | Masters | STEM | 5.0 | 50-99 | Pvt Ltd | NaN | 150 | NO |
| 5992 | city_114 | 0.926 | Male | Has relevent experience | Part time course | Graduate | STEM | 9.0 | Oct-49 | Pvt Ltd | NaN | 51 | NO |
| 6033 | city_152 | 0.698 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 20.0 | 10000+ | Pvt Ltd | NaN | 149 | NO |
| 6064 | city_21 | 0.624 | Male | Has relevent experience | Full time course | Graduate | STEM | 3.0 | NaN | NaN | NaN | 72 | NO |
| 6112 | city_21 | 0.624 | Female | Has relevent experience | no_enrollment | Graduate | STEM | 0.0 | 500-999 | NaN | NaN | 79 | YES |
| 6205 | city_21 | 0.624 | Male | Has relevent experience | Full time course | Graduate | STEM | 3.0 | 10000+ | Public Sector | NaN | 55 | NO |
| 6377 | city_21 | 0.624 | Male | Has relevent experience | Full time course | Masters | STEM | 5.0 | 100-500 | NaN | NaN | 28 | YES |
| 6397 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 11.0 | 10000+ | Pvt Ltd | NaN | 41 | YES |
| 6425 | city_103 | 0.920 | Male | Has relevent experience | Full time course | Graduate | STEM | 4.0 | 100-500 | NaN | NaN | 78 | YES |
| 6554 | city_103 | 0.920 | Male | Has relevent experience | no_enrollment | Masters | STEM | NaN | 10000+ | NaN | NaN | 64 | NO |
| 6587 | city_149 | 0.689 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 17.0 | 1000-4999 | Pvt Ltd | NaN | 48 | NO |
| 6600 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 3.0 | 100-500 | NaN | NaN | 16 | NO |
| 6666 | city_19 | 0.682 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 12.0 | NaN | NaN | NaN | 51 | YES |
| 6690 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 0.0 | Oct-49 | NaN | NaN | 8 | YES |
| 6792 | city_102 | 0.804 | Male | Has relevent experience | Full time course | Graduate | STEM | 4.0 | NaN | NaN | NaN | 45 | NO |
| 6853 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 9.0 | 100-500 | Pvt Ltd | NaN | 88 | YES |
| 7122 | city_74 | 0.579 | Male | Has relevent experience | Full time course | Graduate | STEM | 6.0 | Oct-49 | Pvt Ltd | NaN | 21 | YES |
| 7163 | city_11 | 0.550 | Male | Has relevent experience | Part time course | Primary School | No Major | 1.0 | NaN | NaN | NaN | 9 | YES |
| 7173 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Masters | STEM | 12.0 | NaN | NaN | NaN | 19 | NO |
| 7242 | city_99 | 0.915 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 12.0 | 1000-4999 | Pvt Ltd | NaN | 20 | NO |
| 7307 | city_114 | 0.926 | Male | Has relevent experience | Full time course | Masters | STEM | 10.0 | NaN | Public Sector | NaN | 31 | NO |
| 7330 | city_116 | 0.743 | Male | Has relevent experience | no_enrollment | Graduate | No Major | 7.0 | NaN | NaN | NaN | 68 | YES |
| 7357 | city_173 | 0.878 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 20.0 | NaN | NaN | NaN | 29 | NO |
| 7439 | city_69 | 0.856 | Male | Has relevent experience | no_enrollment | High School | No Major | 20.0 | 100-500 | Pvt Ltd | NaN | 35 | YES |
| 7483 | city_173 | 0.878 | Male | Has relevent experience | no_enrollment | Masters | STEM | 20.0 | 50-99 | Funded Startup | NaN | 19 | NO |
| 7608 | city_21 | 0.624 | Male | Has relevent experience | Full time course | Graduate | Other | NaN | 10000+ | Public Sector | NaN | 91 | NO |
| 7670 | city_162 | 0.767 | Male | Has relevent experience | Full time course | Graduate | STEM | 7.0 | 50-99 | NaN | NaN | 17 | NO |
| 7791 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Graduate | STEM | NaN | 100-500 | Pvt Ltd | NaN | 256 | NO |
| 7838 | city_114 | 0.926 | Male | Has relevent experience | Part time course | Graduate | STEM | 4.0 | Oct-49 | Pvt Ltd | NaN | 12 | NO |
| 7940 | city_103 | 0.920 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 7.0 | NaN | Public Sector | NaN | 11 | NO |
| 8034 | city_176 | 0.764 | Male | Has relevent experience | no_enrollment | Masters | STEM | 4.0 | NaN | NaN | NaN | 24 | NO |
| 8068 | city_143 | 0.740 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 14.0 | NaN | NaN | NaN | 7 | YES |
| 8108 | city_103 | 0.920 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 16.0 | 100-500 | NaN | NaN | 99 | NO |
| 8145 | city_103 | 0.920 | Male | Has relevent experience | Part time course | Graduate | STEM | 9.0 | 1000-4999 | NGO | NaN | 136 | YES |
| 8345 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 6.0 | 50-99 | Pvt Ltd | NaN | 28 | NO |
| 8386 | city_103 | 0.920 | Male | Has relevent experience | no_enrollment | High School | No Major | 8.0 | NaN | NaN | NaN | 92 | NO |
| 8453 | city_103 | 0.920 | Male | Has relevent experience | no_enrollment | Graduate | STEM | NaN | 500-999 | Pvt Ltd | NaN | 11 | NO |
| 8464 | city_21 | 0.624 | Male | Has relevent experience | Full time course | Graduate | STEM | 0.0 | <10 | Pvt Ltd | NaN | 92 | NO |
| 8534 | city_114 | 0.926 | Male | Has relevent experience | no_enrollment | High School | No Major | 20.0 | 50-99 | Pvt Ltd | NaN | 2 | NO |
| 8544 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Masters | STEM | 3.0 | <10 | Early Stage Startup | NaN | 13 | YES |
| 8570 | city_103 | 0.920 | Male | Has relevent experience | no_enrollment | Masters | STEM | 20.0 | 100-500 | Pvt Ltd | NaN | 42 | NO |
| 8611 | city_162 | 0.767 | Male | Has relevent experience | Full time course | Graduate | STEM | 3.0 | 50-99 | Public Sector | NaN | 55 | NO |
| 8762 | city_21 | 0.624 | Male | Has relevent experience | Full time course | Graduate | STEM | 1.0 | <10 | NGO | NaN | 176 | YES |
| 8966 | city_75 | 0.939 | Male | Has relevent experience | no_enrollment | High School | No Major | 5.0 | <10 | Pvt Ltd | NaN | 22 | NO |
| 9163 | city_21 | 0.624 | Male | Has relevent experience | Part time course | Graduate | STEM | 5.0 | 1000-4999 | Pvt Ltd | NaN | 55 | NO |
| 9176 | city_173 | 0.878 | Male | Has relevent experience | Full time course | Masters | STEM | 20.0 | NaN | NaN | NaN | 28 | NO |
| 9636 | city_105 | 0.794 | Male | Has relevent experience | Full time course | High School | No Major | 0.0 | NaN | NaN | NaN | 18 | NO |
| 9805 | city_160 | 0.920 | Male | Has relevent experience | no_enrollment | Masters | STEM | 16.0 | NaN | NaN | NaN | 91 | YES |
| 9875 | city_160 | 0.920 | Male | Has relevent experience | Full time course | High School | No Major | 4.0 | Oct-49 | Pvt Ltd | NaN | 108 | NO |
| 9943 | city_16 | 0.910 | Male | Has relevent experience | no_enrollment | Masters | STEM | 14.0 | 1000-4999 | NaN | NaN | 47 | NO |
| 9959 | city_116 | 0.743 | Male | Has relevent experience | Full time course | High School | No Major | 2.0 | 100-500 | Pvt Ltd | NaN | 108 | NO |
| 9974 | city_21 | 0.624 | Female | Has relevent experience | no_enrollment | Masters | STEM | NaN | 5000-9999 | Pvt Ltd | NaN | 216 | NO |
| 9990 | city_136 | 0.897 | Male | Has relevent experience | no_enrollment | Masters | STEM | NaN | 100-500 | NaN | NaN | 46 | NO |
| 10084 | city_101 | 0.558 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 5.0 | Oct-49 | Early Stage Startup | NaN | 141 | NO |
| 10100 | city_73 | 0.754 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 11.0 | 1000-4999 | Pvt Ltd | NaN | 156 | NO |
| 10156 | city_21 | 0.624 | Male | Has relevent experience | Full time course | Masters | STEM | 4.0 | 1000-4999 | Pvt Ltd | NaN | 64 | YES |
| 10180 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Masters | STEM | 8.0 | <10 | Early Stage Startup | NaN | 12 | NO |
| 10305 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 0.0 | 500-999 | Pvt Ltd | NaN | 39 | YES |
| 10362 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 2.0 | <10 | Funded Startup | NaN | 35 | NO |
| 10426 | city_65 | 0.802 | Male | Has relevent experience | no_enrollment | Primary School | No Major | 20.0 | 1000-4999 | Pvt Ltd | NaN | 6 | NO |
| 10478 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 1.0 | 1000-4999 | Pvt Ltd | NaN | 142 | NO |
| 10570 | city_103 | 0.920 | Male | Has relevent experience | no_enrollment | Graduate | Humanities | 3.0 | NaN | NaN | NaN | 4 | NO |
| 10652 | city_136 | 0.897 | Male | Has relevent experience | no_enrollment | Masters | STEM | 3.0 | Oct-49 | Pvt Ltd | NaN | 11 | NO |
| 10746 | city_21 | 0.624 | Male | Has relevent experience | Full time course | Masters | STEM | 5.0 | Oct-49 | NaN | NaN | 10 | YES |
| 10789 | city_59 | 0.775 | Male | Has relevent experience | Full time course | Graduate | STEM | 8.0 | 50-99 | Pvt Ltd | NaN | 164 | NO |
| 10853 | city_160 | 0.920 | Male | Has relevent experience | Full time course | Masters | STEM | 5.0 | 50-99 | Pvt Ltd | NaN | 24 | NO |
| 10872 | city_162 | 0.767 | Male | Has relevent experience | Full time course | Graduate | STEM | 3.0 | NaN | NaN | NaN | 59 | YES |
| 10961 | city_160 | 0.920 | Male | Has relevent experience | no_enrollment | Masters | Humanities | 20.0 | <10 | Other | NaN | 42 | NO |
| 11191 | city_143 | 0.740 | Male | Has relevent experience | no_enrollment | Masters | STEM | 9.0 | 5000-9999 | NaN | NaN | 43 | YES |
| 11467 | city_21 | 0.624 | Male | Has relevent experience | Full time course | Graduate | STEM | 4.0 | 500-999 | Pvt Ltd | NaN | 46 | YES |
| 11539 | city_166 | 0.649 | Male | Has relevent experience | Full time course | Primary School | No Major | 5.0 | <10 | Other | NaN | 188 | NO |
| 11565 | city_76 | 0.698 | Male | Has relevent experience | no_enrollment | Masters | STEM | 20.0 | Oct-49 | Pvt Ltd | NaN | 10 | NO |
| 11677 | city_42 | 0.563 | Female | Has relevent experience | Full time course | Phd | STEM | 0.0 | NaN | NaN | NaN | 43 | YES |
| 11730 | city_21 | 0.624 | Female | Has relevent experience | no_enrollment | Graduate | STEM | 2.0 | 50-99 | Pvt Ltd | NaN | 10 | NO |
| 11766 | city_21 | 0.624 | Female | Has relevent experience | Full time course | Graduate | STEM | 5.0 | NaN | NaN | NaN | 145 | NO |
| 11786 | city_28 | 0.939 | Female | Has relevent experience | no_enrollment | Primary School | No Major | 18.0 | 50-99 | Pvt Ltd | NaN | 47 | YES |
| 12119 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Masters | STEM | 1.0 | 50-99 | Pvt Ltd | NaN | 34 | NO |
| 12362 | city_97 | 0.925 | Female | Has relevent experience | no_enrollment | Graduate | STEM | 19.0 | 1000-4999 | Pvt Ltd | NaN | 42 | NO |
| 12389 | city_103 | 0.920 | Female | Has relevent experience | Part time course | Graduate | STEM | 3.0 | NaN | NaN | NaN | 30 | NO |
| 12395 | city_123 | 0.738 | Female | Has relevent experience | no_enrollment | Graduate | STEM | NaN | 100-500 | Pvt Ltd | NaN | 9 | YES |
| 12527 | city_21 | 0.624 | Female | Has relevent experience | no_enrollment | Graduate | STEM | 12.0 | 50-99 | Pvt Ltd | NaN | 133 | NO |
| 12738 | city_21 | 0.624 | Female | Has relevent experience | no_enrollment | Graduate | STEM | 3.0 | 50-99 | Pvt Ltd | NaN | 8 | YES |
| 12772 | city_114 | 0.926 | Male | Has relevent experience | no_enrollment | Masters | STEM | 9.0 | 50-99 | Pvt Ltd | NaN | 22 | NO |
| 12810 | city_13 | 0.827 | Female | Has relevent experience | no_enrollment | Masters | STEM | 20.0 | Oct-49 | Pvt Ltd | NaN | 17 | NO |
| 12914 | city_136 | 0.897 | Female | Has relevent experience | no_enrollment | Masters | STEM | 20.0 | NaN | NaN | NaN | 85 | YES |
| 12920 | city_103 | 0.920 | Female | Has relevent experience | Full time course | Masters | STEM | 20.0 | NaN | NaN | NaN | 40 | NO |
| 12971 | city_114 | 0.926 | Male | Has relevent experience | no_enrollment | High School | No Major | 5.0 | 500-999 | Pvt Ltd | NaN | 10 | NO |
| 12987 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 1.0 | 10000+ | Pvt Ltd | NaN | 38 | NO |
| 13236 | city_103 | 0.920 | Female | Has relevent experience | no_enrollment | Primary School | No Major | 20.0 | NaN | NaN | NaN | 21 | NO |
| 13312 | city_90 | 0.698 | Female | Has relevent experience | Part time course | Primary School | No Major | 13.0 | 50-99 | Pvt Ltd | NaN | 18 | NO |
| 13419 | city_73 | 0.754 | Female | Has relevent experience | Part time course | High School | No Major | 0.0 | <10 | Early Stage Startup | NaN | 58 | NO |
| 13491 | city_103 | 0.920 | Male | Has relevent experience | Full time course | Graduate | STEM | 9.0 | NaN | NGO | NaN | 49 | YES |
| 13529 | city_21 | 0.624 | Female | Has relevent experience | no_enrollment | Graduate | Other | 0.0 | 10000+ | Pvt Ltd | NaN | 66 | YES |
| 13569 | city_89 | 0.925 | Female | Has relevent experience | no_enrollment | Graduate | Business Degree | 13.0 | 10000+ | Pvt Ltd | NaN | 26 | NO |
| 13682 | city_114 | 0.926 | Female | Has relevent experience | no_enrollment | Masters | STEM | 20.0 | 500-999 | Pvt Ltd | NaN | 32 | NO |
| 13689 | city_21 | 0.624 | Female | Has relevent experience | no_enrollment | Graduate | STEM | 3.0 | 100-500 | Pvt Ltd | NaN | 18 | YES |
| 13720 | city_136 | 0.897 | Female | Has relevent experience | Part time course | Graduate | STEM | 4.0 | 100-500 | Pvt Ltd | NaN | 39 | NO |
| 13759 | city_91 | 0.691 | Female | Has relevent experience | no_enrollment | Graduate | STEM | 5.0 | Oct-49 | Pvt Ltd | NaN | 7 | NO |
| 13866 | city_21 | 0.624 | Female | Has relevent experience | no_enrollment | Graduate | STEM | 4.0 | 50-99 | Early Stage Startup | NaN | 12 | YES |
| 14026 | city_114 | 0.926 | Female | Has relevent experience | no_enrollment | Graduate | STEM | 20.0 | NaN | NaN | NaN | 92 | NO |
| 14059 | city_103 | 0.920 | Female | Has relevent experience | no_enrollment | Masters | STEM | 13.0 | 10000+ | Pvt Ltd | NaN | 90 | NO |
| 14126 | city_16 | 0.910 | Female | Has relevent experience | Full time course | Graduate | STEM | 2.0 | NaN | NaN | NaN | 14 | NO |
| 14251 | city_21 | 0.624 | Female | Has relevent experience | no_enrollment | Masters | STEM | 6.0 | Oct-49 | Pvt Ltd | NaN | 45 | YES |
| 14648 | city_21 | 0.624 | Female | Has relevent experience | Full time course | Graduate | STEM | 3.0 | 100-500 | Pvt Ltd | NaN | 102 | YES |
| 14895 | city_116 | 0.743 | Female | Has relevent experience | no_enrollment | Primary School | No Major | 7.0 | 10000+ | NaN | NaN | 18 | NO |
| 15022 | city_21 | 0.624 | Female | Has relevent experience | Full time course | Graduate | STEM | 6.0 | NaN | NaN | NaN | 42 | YES |
| 15098 | city_136 | 0.897 | Male | Has relevent experience | no_enrollment | Masters | STEM | 10.0 | NaN | NaN | NaN | 9 | NO |
| 15135 | city_116 | 0.743 | Female | Has relevent experience | no_enrollment | Masters | STEM | 5.0 | Oct-49 | Early Stage Startup | NaN | 94 | NO |
| 15315 | city_103 | 0.920 | Female | Has relevent experience | Full time course | High School | No Major | 6.0 | <10 | Pvt Ltd | NaN | 81 | NO |
| 15322 | city_21 | 0.624 | Female | Has relevent experience | Full time course | Graduate | STEM | 4.0 | 50-99 | NaN | NaN | 21 | YES |
| 15377 | city_9 | 0.743 | Female | Has relevent experience | Part time course | Graduate | Other | 4.0 | <10 | Pvt Ltd | NaN | 334 | YES |
| 15388 | city_21 | 0.624 | Female | Has relevent experience | Full time course | Graduate | STEM | 10.0 | 50-99 | Pvt Ltd | NaN | 40 | YES |
| 15393 | city_73 | 0.754 | Female | Has relevent experience | no_enrollment | Masters | No Major | NaN | 50-99 | Pvt Ltd | NaN | 5 | NO |
| 15571 | city_103 | 0.920 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 6.0 | 50-99 | Funded Startup | NaN | 87 | NO |
| 15775 | city_21 | 0.624 | Female | Has relevent experience | no_enrollment | Masters | STEM | 20.0 | NaN | NaN | NaN | 20 | YES |
| 15822 | city_36 | 0.893 | Female | Has relevent experience | Part time course | High School | No Major | NaN | 100-500 | NaN | NaN | 94 | NO |
| 16129 | city_89 | 0.925 | Male | Has relevent experience | Full time course | Graduate | STEM | 11.0 | 50-99 | NaN | NaN | 270 | NO |
| 16279 | city_11 | 0.550 | Female | Has relevent experience | no_enrollment | Graduate | STEM | NaN | 1000-4999 | NaN | NaN | 46 | NO |
| 16402 | city_100 | 0.887 | Female | Has relevent experience | no_enrollment | Graduate | No Major | 5.0 | NaN | NaN | NaN | 12 | NO |
| 16531 | city_134 | 0.698 | Female | Has relevent experience | Full time course | Masters | Other | 20.0 | <10 | Early Stage Startup | NaN | 52 | NO |
| 16541 | city_114 | 0.926 | Female | Has relevent experience | Full time course | Graduate | STEM | 7.0 | 50-99 | NaN | NaN | 140 | NO |
| 16632 | city_116 | 0.743 | Female | Has relevent experience | no_enrollment | Graduate | STEM | 9.0 | 100-500 | NaN | NaN | 144 | NO |
| 16757 | city_21 | 0.624 | Female | Has relevent experience | no_enrollment | Graduate | STEM | 8.0 | 10000+ | Pvt Ltd | NaN | 91 | NO |
| 16933 | city_76 | 0.698 | Female | Has relevent experience | no_enrollment | Masters | STEM | 1.0 | Oct-49 | Pvt Ltd | NaN | 29 | YES |
| 17004 | city_144 | 0.840 | Female | Has relevent experience | no_enrollment | Graduate | STEM | 3.0 | 100-500 | Public Sector | NaN | 8 | NO |
| 17033 | city_143 | 0.740 | Female | Has relevent experience | no_enrollment | Graduate | STEM | 9.0 | 100-500 | Pvt Ltd | NaN | 4 | NO |
| 17078 | city_103 | 0.920 | Female | Has relevent experience | no_enrollment | Masters | STEM | 16.0 | 500-999 | Pvt Ltd | NaN | 57 | NO |
| 17090 | city_123 | 0.738 | Female | Has relevent experience | no_enrollment | Graduate | STEM | 4.0 | Oct-49 | Pvt Ltd | NaN | 23 | NO |
| 17093 | city_103 | 0.920 | Female | Has relevent experience | no_enrollment | Graduate | STEM | 20.0 | 100-500 | NGO | NaN | 206 | NO |
| 17318 | city_21 | 0.624 | Male | Has relevent experience | Full time course | Graduate | STEM | 2.0 | 10000+ | Pvt Ltd | NaN | 43 | YES |
| 17453 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Primary School | No Major | 4.0 | NaN | NaN | NaN | 18 | YES |
| 17517 | city_21 | 0.624 | Female | Has relevent experience | no_enrollment | Masters | STEM | 3.0 | NaN | NaN | NaN | 55 | YES |
| 17520 | city_90 | 0.698 | Male | Has relevent experience | Full time course | Graduate | STEM | 8.0 | Oct-49 | Pvt Ltd | NaN | 25 | YES |
| 17774 | city_19 | 0.682 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 4.0 | 500-999 | NaN | NaN | 12 | NO |
| 17821 | city_21 | 0.624 | Female | Has relevent experience | no_enrollment | Masters | STEM | 0.0 | 100-500 | Pvt Ltd | NaN | 330 | YES |
| 17891 | city_136 | 0.897 | Male | Has relevent experience | Full time course | Graduate | STEM | NaN | Oct-49 | NGO | NaN | 26 | YES |
| 17977 | city_134 | 0.698 | Male | Has relevent experience | no_enrollment | Primary School | No Major | 20.0 | NaN | NaN | NaN | 51 | NO |
| 17996 | city_27 | 0.848 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 5.0 | 500-999 | NaN | NaN | 56 | NO |
| 18026 | city_162 | 0.767 | Male | Has relevent experience | no_enrollment | High School | No Major | 20.0 | 50-99 | Pvt Ltd | NaN | 102 | NO |
| 18060 | city_21 | 0.624 | Male | Has relevent experience | Full time course | Graduate | STEM | 1.0 | 50-99 | Pvt Ltd | NaN | 27 | YES |
| 18122 | city_100 | 0.887 | Female | Has relevent experience | no_enrollment | Graduate | STEM | 10.0 | 5000-9999 | NaN | NaN | 23 | NO |
| 18133 | city_134 | 0.698 | Female | Has relevent experience | no_enrollment | Masters | STEM | 6.0 | NaN | NaN | NaN | 36 | NO |
| 18143 | city_21 | 0.624 | Female | Has relevent experience | no_enrollment | Graduate | STEM | NaN | Oct-49 | NaN | NaN | 182 | YES |
| 18302 | city_21 | 0.624 | Female | Has relevent experience | no_enrollment | Graduate | STEM | 7.0 | 10000+ | Pvt Ltd | NaN | 24 | YES |
| 18473 | city_128 | 0.527 | Female | Has relevent experience | Part time course | High School | No Major | 2.0 | NaN | NaN | NaN | 50 | YES |
| 18648 | city_21 | 0.624 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 4.0 | 10000+ | Pvt Ltd | NaN | 82 | YES |
| 18772 | city_21 | 0.624 | Other | Has relevent experience | no_enrollment | Graduate | STEM | 2.0 | <10 | Early Stage Startup | NaN | 4 | NO |
| 18863 | city_103 | 0.920 | Other | Has relevent experience | no_enrollment | Graduate | Humanities | 19.0 | 10000+ | Pvt Ltd | NaN | 13 | YES |
| 18887 | city_21 | 0.624 | Other | Has relevent experience | no_enrollment | Graduate | STEM | 6.0 | Oct-49 | NaN | NaN | 39 | NO |
| 19139 | city_21 | 0.624 | Other | Has relevent experience | Full time course | Graduate | STEM | 4.0 | NaN | NaN | NaN | 13 | NO |
hr.last_new_job.fillna("never").value_counts().plot.bar()
# checking if filling with never will make changes in overall distribution
<AxesSubplot:>
sns.countplot(x="last_new_job", data=hr) #distrbution without filling na with never
plt.show()
hr.last_new_job.fillna("never",inplace=True)
sns.countplot(x="last_new_job", data=hr) # dist still intact
plt.show()
(hr.isnull().sum()/len(hr))*100
city 0.000000 city_development_index 0.000000 gender 0.000000 relevent_experience 0.000000 enrolled_university 0.000000 education_level 0.000000 major_discipline 0.000000 experience 0.339284 company_size 30.994885 company_type 32.049274 last_new_job 0.000000 training_hours 0.000000 job_change 0.000000 dtype: float64
hr.company_type.value_counts()
Pvt Ltd 9817 Funded Startup 1001 Public Sector 955 Early Stage Startup 603 NGO 521 Other 121 Name: company_type, dtype: int64
hr.company_type.unique()
array([nan, 'Pvt Ltd', 'Funded Startup', 'Early Stage Startup', 'Other',
'Public Sector', 'NGO'], dtype=object)
hr.company_type.fillna("Other", inplace=True) #if compnay_type not known make sense to fill with other
sns.countplot(x="company_type", data=hr)
plt.xticks(rotation=90)
plt.show()
hr.isnull().sum()
city 0 city_development_index 0 gender 0 relevent_experience 0 enrolled_university 0 education_level 0 major_discipline 0 experience 65 company_size 5938 company_type 0 last_new_job 0 training_hours 0 job_change 0 dtype: int64
hr.experience.unique()
array([20., 15., 5., 0., 11., 13., 7., 17., 2., 16., 1., 4., 10.,
14., 18., 19., 12., 3., 6., 9., 8., nan])
sns.displot(hr.experience)
plt.show()
hr.groupby("relevent_experience")["experience"].median()
relevent_experience Has relevent experience 10.0 No relevent experience 4.0 Name: experience, dtype: float64
hr.loc[(hr.relevent_experience=="Has relevent experience") & (hr.experience.isnull()), "experience"]=10.0
hr.loc[(hr.relevent_experience=="No relevent experience") & (hr.experience.isnull()), "experience"]=4.0
sns.displot(hr.experience)
plt.show()
hr.isnull().sum()
city 0 city_development_index 0 gender 0 relevent_experience 0 enrolled_university 0 education_level 0 major_discipline 0 experience 0 company_size 5938 company_type 0 last_new_job 0 training_hours 0 job_change 0 dtype: int64
hr.company_size.value_counts()
50-99 3083 100-500 2571 10000+ 2019 Oct-49 1471 1000-4999 1328 <10 1308 500-999 877 5000-9999 563 Name: company_size, dtype: int64
hr.company_size= hr.company_size.str.replace("Oct-49","49")
hr.company_size.value_counts()
50-99 3083 100-500 2571 10000+ 2019 49 1471 1000-4999 1328 <10 1308 500-999 877 5000-9999 563 Name: company_size, dtype: int64
hr.company_size.unique()
array([nan, '50-99', '<10', '10000+', '5000-9999', '1000-4999', '49',
'100-500', '500-999'], dtype=object)
hr[hr.company_size.isnull()]["company_type"].value_counts()
Other 5368 Pvt Ltd 406 Public Sector 129 NGO 26 Funded Startup 6 Early Stage Startup 3 Name: company_type, dtype: int64
hr.groupby("company_type")["company_size"].value_counts()
company_type company_size
Early Stage Startup <10 286
49 176
50-99 109
100-500 27
500-999 2
Funded Startup 50-99 390
100-500 214
49 193
<10 144
500-999 54
NGO 100-500 174
50-99 77
1000-4999 73
10000+ 37
500-999 37
<10 36
49 32
5000-9999 29
Other 50-99 240
100-500 172
49 117
10000+ 93
<10 79
500-999 75
1000-4999 74
5000-9999 43
Public Sector 1000-4999 165
100-500 151
10000+ 150
50-99 116
500-999 87
5000-9999 79
49 43
<10 35
Pvt Ltd 50-99 2151
100-500 1833
10000+ 1739
1000-4999 1016
49 910
<10 728
500-999 622
5000-9999 412
Name: company_size, dtype: int64
hr.groupby("company_type")["company_size"].count()
company_type Early Stage Startup 600 Funded Startup 995 NGO 495 Other 893 Public Sector 826 Pvt Ltd 9411 Name: company_size, dtype: int64
pd.crosstab(hr.company_type,hr.company_size).plot.bar()
plt.show()
sns.countplot(x="company_size", data=hr)
plt.xticks(rotation=90)
plt.show()
hr.company_size.fillna("50-99").value_counts().plot.bar()
plt.show()
# if we fill by 50-90 the count of 50-90 will drastically increase from 3000 to 8000
hr.loc[(hr.company_type=="Other") & (hr.company_size.isnull()), "company_size"]="50-99"
hr.loc[(hr.company_type=="Early Stage Startup") & (hr.company_size.isnull()), "company_size"]="<10"
hr.loc[(hr.company_type=="Funded Startup") & (hr.company_size.isnull()), "company_size"]="50-99"
hr.loc[(hr.company_type=="Public Sector") & (hr.company_size.isnull()), "company_size"]="1000-4999"
hr.loc[(hr.company_type=="NGO") & (hr.company_size.isnull()), "company_size"]="100-500"
hr.loc[(hr.company_type=="Pvt Ltd") & (hr.company_size.isnull()), "company_size"]="50-99"
hr.company_size.value_counts().plot.bar()
plt.show()
for i in hr.select_dtypes(include=np.number).columns:
plt.boxplot(hr[i])
plt.xlabel(i)
plt.show()
hr.select_dtypes(include="object").columns
Index(['city', 'gender', 'relevent_experience', 'enrolled_university',
'education_level', 'major_discipline', 'company_size', 'company_type',
'last_new_job', 'job_change'],
dtype='object')
def uni_cat(x):
sns.countplot(x=x, data=hr)
plt.show()
print(hr[x].value_counts(normalize=True)*100)
print("the category with highest number of frequency is ",hr[x].mode()[0])
uni_cat("job_change")
NO 75.065247 YES 24.934753 Name: job_change, dtype: float64 the category with highest number of frequency is NO
uni_cat("gender")
Male 84.173713 Female 14.046351 Other 1.779935 Name: gender, dtype: float64 the category with highest number of frequency is Male
hr.gender.value_counts(1)
Male 0.841737 Female 0.140464 Other 0.017799 Name: gender, dtype: float64
hr.groupby("gender")["job_change"].value_counts()
gender job_change
Female NO 1935
YES 756
Male NO 12208
YES 3918
Other NO 238
YES 103
Name: job_change, dtype: int64
uni_cat("relevent_experience")
Has relevent experience 71.990813 No relevent experience 28.009187 Name: relevent_experience, dtype: float64 the category with highest number of frequency is Has relevent experience
uni_cat("enrolled_university")
no_enrollment 74.136131 Full time course 19.610607 Part time course 6.253262 Name: enrolled_university, dtype: float64 the category with highest number of frequency is no_enrollment
uni_cat("education_level")
Graduate 60.538678 Masters 22.763336 High School 11.728782 Primary School 2.808226 Phd 2.160977 Name: education_level, dtype: float64 the category with highest number of frequency is Graduate
sns.countplot(x="major_discipline", data=hr)
plt.xticks(rotation=90)
plt.show()
print(hr["major_discipline"].value_counts(normalize=True)*100)
print("the category with highest number of frequency is ",hr["major_discipline"].mode()[0])
STEM 75.644639 No Major 15.847166 Humanities 3.492014 Other 1.988725 Business Degree 1.706859 Arts 1.320597 Name: major_discipline, dtype: float64 the category with highest number of frequency is STEM
sns.countplot(x="company_size", data=hr)
plt.xticks(rotation=90)
plt.show()
print(hr["company_size"].value_counts(normalize=True)*100)
print("the category with highest number of frequency is ",hr["company_size"].mode()[0])
50-99 46.262658 100-500 13.555695 10000+ 10.538678 49 7.678255 1000-4999 7.605178 <10 6.843094 500-999 4.577722 5000-9999 2.938720 Name: company_size, dtype: float64 the category with highest number of frequency is 50-99
sns.countplot(x="company_type", data=hr)
plt.xticks(rotation=90)
plt.show()
print(hr["company_type"].value_counts(normalize=True)*100)
print("the category with highest number of frequency is ",hr["company_type"].mode()[0])
Pvt Ltd 51.242301 Other 32.680864 Funded Startup 5.224971 Public Sector 4.984863 Early Stage Startup 3.147510 NGO 2.719491 Name: company_type, dtype: float64 the category with highest number of frequency is Pvt Ltd
hr.select_dtypes(exclude="object").columns
Index(['city_development_index', 'experience', 'training_hours'], dtype='object')
def uni_num(x):
sns.displot(hr[x])
plt.show()
sns.boxplot(x=x,data=hr)
plt.show()
print("skewness of the ",x,"column is ",hr[x].skew())
probplot(hr[x],plot=plt)
plt.show()
uni_num("experience")
skewness of the experience column is 0.3410292909488931
hr.experience.describe()
count 19158.000000 mean 9.921704 std 6.497186 min 0.000000 25% 4.000000 50% 9.000000 75% 16.000000 max 20.000000 Name: experience, dtype: float64
uni_num("training_hours")
skewness of the training_hours column is 1.8192372420221026
hr.training_hours.describe()
count 19158.000000 mean 65.366896 std 60.058462 min 1.000000 25% 23.000000 50% 47.000000 75% 88.000000 max 336.000000 Name: training_hours, dtype: float64
uni_num("city_development_index")
skewness of the city_development_index column is -0.9954275351977435
def binum(x,y):
sns.scatterplot(x=x,y=y,data=hr)
plt.show()
binum("experience","training_hours")
def catvnum(x,y):
sns.boxplot(x=x,y=y,data=hr)
plt.show()
catvnum("job_change","experience")
hr.groupby("job_change")["experience"].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| job_change | ||||||||
| NO | 14381.0 | 10.588068 | 6.491032 | 0.0 | 5.0 | 10.0 | 17.0 | 20.0 |
| YES | 4777.0 | 7.915637 | 6.091035 | 0.0 | 3.0 | 6.0 | 11.0 | 20.0 |
hr.groupby("job_change")["experience"].skew()
job_change NO 0.208524 YES 0.798224 Name: experience, dtype: float64
sns.violinplot(x="job_change",y="training_hours",data=hr)
plt.show()
catvnum("job_change","city_development_index")
plt.bar(hr["job_change"], hr["city_development_index"])
plt.show()
hr.groupby("job_change")["city_development_index"].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| job_change | ||||||||
| NO | 14381.0 | 0.853139 | 0.105354 | 0.448 | 0.804 | 0.91 | 0.92 | 0.949 |
| YES | 4777.0 | 0.755719 | 0.143166 | 0.448 | 0.624 | 0.74 | 0.92 | 0.949 |
catvnum("education_level","experience")
hr.groupby("education_level")["experience"].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| education_level | ||||||||
| Graduate | 11598.0 | 9.811433 | 6.384456 | 0.0 | 4.0 | 8.0 | 15.0 | 20.0 |
| High School | 2247.0 | 6.512239 | 5.479965 | 0.0 | 3.0 | 5.0 | 9.0 | 20.0 |
| Masters | 4361.0 | 11.860812 | 6.298473 | 0.0 | 7.0 | 11.0 | 19.0 | 20.0 |
| Phd | 414.0 | 15.917874 | 5.580407 | 0.0 | 11.0 | 20.0 | 20.0 | 20.0 |
| Primary School | 538.0 | 6.206320 | 5.746037 | 0.0 | 2.0 | 4.0 | 8.0 | 20.0 |
catvnum("major_discipline","experience")
hr.groupby("major_discipline")["experience"].describe()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| major_discipline | ||||||||
| Arts | 253.0 | 10.288538 | 6.890855 | 0.0 | 4.0 | 10.0 | 17.0 | 20.0 |
| Business Degree | 327.0 | 10.727829 | 7.460073 | 0.0 | 3.5 | 10.0 | 20.0 | 20.0 |
| Humanities | 669.0 | 10.168909 | 7.397393 | 0.0 | 3.0 | 9.0 | 19.0 | 20.0 |
| No Major | 3036.0 | 6.798419 | 5.789004 | 0.0 | 3.0 | 5.0 | 9.0 | 20.0 |
| Other | 381.0 | 10.149606 | 6.822654 | 0.0 | 4.0 | 9.0 | 17.0 | 20.0 |
| STEM | 14492.0 | 10.534019 | 6.366014 | 0.0 | 5.0 | 9.0 | 16.0 | 20.0 |
cdi = hr.sort_values(by='city_development_index', ascending=True)[0:2000]
figure = plt.figure(figsize=(10,6))
sns.barplot(y=cdi.city, x=hr.city_development_index)
plt.xticks()
plt.xlabel('city_development_index')
plt.ylabel('city')
plt.title('City by city development index')
plt.show()
def crosstab(x,y):
print(pd.crosstab(x,y,normalize="index")*100)
pd.crosstab(x,y,normalize="index").plot(kind='bar')
crosstab(hr.gender,hr.job_change)
job_change NO YES gender Female 71.906355 28.093645 Male 75.703832 24.296168 Other 69.794721 30.205279
crosstab(hr.relevent_experience,hr.job_change)
job_change NO YES relevent_experience Has relevent experience 78.531032 21.468968 No relevent experience 66.157287 33.842713
crosstab(hr.enrolled_university,hr.job_change)
job_change NO YES enrolled_university Full time course 61.911099 38.088901 Part time course 74.791319 25.208681 no_enrollment 78.567908 21.432092
crosstab(hr.major_discipline,hr.job_change,)
job_change NO YES major_discipline Arts 79.051383 20.948617 Business Degree 73.700306 26.299694 Humanities 78.923767 21.076233 No Major 80.105402 19.894598 Other 73.228346 26.771654 STEM 73.840740 26.159260
crosstab(hr.education_level,hr.job_change)
job_change NO YES education_level Graduate 72.021038 27.978962 High School 79.839786 20.160214 Masters 78.559963 21.440037 Phd 85.990338 14.009662 Primary School 84.014870 15.985130
plt.figure(figsize=[20,20])
hr.groupby(["major_discipline","education_level"])["job_change"].value_counts(normalize=True).unstack().plot(kind="bar")
plt.show()
<Figure size 1440x1440 with 0 Axes>
crosstab(hr.company_size,hr.job_change)
job_change NO YES company_size 100-500 83.634963 16.365037 1000-4999 82.566918 17.433082 10000+ 80.931154 19.068846 49 76.614548 23.385452 50-99 67.403814 32.596186 500-999 82.668187 17.331813 5000-9999 81.882771 18.117229 <10 82.761251 17.238749
crosstab(hr.company_type,hr.job_change)
job_change NO YES company_type Early Stage Startup 76.451078 23.548922 Funded Startup 86.013986 13.986014 NGO 81.381958 18.618042 Other 61.459831 38.540169 Public Sector 78.010471 21.989529 Pvt Ltd 81.919120 18.080880
crosstab(hr.last_new_job,hr.job_change)
job_change NO YES last_new_job 1 73.569652 26.430348 2 75.862069 24.137931 3 77.441406 22.558594 4 77.842566 22.157434 >4 81.762918 18.237082 never 68.939130 31.060870
hr.groupby(["major_discipline"])["training_hours"].mean()
major_discipline Arts 60.272727 Business Degree 66.189602 Humanities 65.635277 No Major 66.324111 Other 66.049869 STEM 65.206390 Name: training_hours, dtype: float64
hr.groupby(["education_level",])["training_hours"].mean()
education_level Graduate 65.773409 High School 67.120605 Masters 63.270809 Phd 67.524155 Primary School 64.609665 Name: training_hours, dtype: float64
pd.crosstab([hr.major_discipline,hr.education_level],hr.last_new_job,normalize="index").plot.bar()
plt.show()
hr.head()
| city | city_development_index | gender | relevent_experience | enrolled_university | education_level | major_discipline | experience | company_size | company_type | last_new_job | training_hours | job_change | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | city_103 | 0.920 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 20.0 | 50-99 | Other | 1 | 36 | YES |
| 1 | city_40 | 0.776 | Male | No relevent experience | no_enrollment | Graduate | STEM | 15.0 | 50-99 | Pvt Ltd | >4 | 47 | NO |
| 2 | city_21 | 0.624 | Male | No relevent experience | Full time course | Graduate | STEM | 5.0 | 50-99 | Other | never | 83 | NO |
| 3 | city_115 | 0.789 | Male | No relevent experience | no_enrollment | Graduate | Business Degree | 0.0 | 50-99 | Pvt Ltd | never | 52 | YES |
| 4 | city_162 | 0.767 | Male | Has relevent experience | no_enrollment | Masters | STEM | 20.0 | 50-99 | Funded Startup | 4 | 8 | NO |
pd.options.display.max_columns=None
plt.figure(figsize=(10,10))
sns.heatmap(hr.corr(),annot=True)
<AxesSubplot:>
sns.pairplot(hr, diag_kind="kde")
plt.show()
hr.std()
city_development_index 0.123362 experience 6.497186 training_hours 60.058462 dtype: float64
hr.describe(include="object")
| city | gender | relevent_experience | enrolled_university | education_level | major_discipline | company_size | company_type | last_new_job | job_change | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 19158 | 19158 | 19158 | 19158 | 19158 | 19158 | 19158 | 19158 | 19158 | 19158 |
| unique | 123 | 3 | 2 | 3 | 5 | 6 | 8 | 6 | 6 | 2 |
| top | city_103 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 50-99 | Pvt Ltd | 1 | NO |
| freq | 4355 | 16126 | 13792 | 14203 | 11598 | 14492 | 8863 | 9817 | 8040 | 14381 |
hr.drop("city",inplace=True, axis=1)
H0: Cat columns are independant
H1: Cat columsn are dependant
from scipy.stats import chi2_contingency
cat= hr.drop("job_change", axis=1).select_dtypes(include="object").columns
cat
Index(['gender', 'relevent_experience', 'enrolled_university',
'education_level', 'major_discipline', 'company_size', 'company_type',
'last_new_job'],
dtype='object')
for i in cat:
obs= pd.crosstab(hr[i],hr.job_change)
sts, pvalue, ddof, exp= chi2_contingency(obs)
print(i,"is", pvalue)
gender is 1.0541322646977614e-05 relevent_experience is 1.5006628411178982e-70 enrolled_university is 2.267945402973493e-96 education_level is 3.9147580257452785e-34 major_discipline is 6.56029585540239e-12 company_size is 1.5236345116667286e-113 company_type is 3.2029488094685203e-202 last_new_job is 1.5317467805104205e-31
H0: mu1 = mu2
H1: mu1 <> mu2
y=hr[hr["job_change"]=="YES"]["training_hours"]
n=hr[hr["job_change"]=="NO"]["training_hours"]
from scipy import stats
stats, pvalue= stats.ttest_ind(y,n)
if pvalue<0.05:
print("reject null hypothesis")
else:
print("fail to reject null hypothesis")
reject null hypothesis
H0: mu=8
H1: mu<>8
samp=hr.loc[hr.job_change=="YES","experience"]
samp.mean()
7.915637429348964
from scipy import stats
t_stat, p_val = stats.ttest_1samp(samp, popmean = 8)
if p_val<0.05:
print("reject null hypothesis")
else:
print("fail to reject null hypothesis")
fail to reject null hypothesis
hr.gender.value_counts()
Male 16126 Female 2691 Other 341 Name: gender, dtype: int64
hr.select_dtypes(include="object").columns
Index(['gender', 'relevent_experience', 'enrolled_university',
'education_level', 'major_discipline', 'company_size', 'company_type',
'last_new_job', 'job_change'],
dtype='object')
hr.corr()
| city_development_index | experience | training_hours | |
|---|---|---|---|
| city_development_index | 1.000000 | 0.333866 | 0.001920 |
| experience | 0.333866 | 1.000000 | 0.001326 |
| training_hours | 0.001920 | 0.001326 | 1.000000 |
from scipy import stats
z= stats.zscore(hr.training_hours)
hr[z>3]["training_hours"].min()
246
encode=pd.get_dummies(hr[['gender', 'relevent_experience',
'enrolled_university', 'major_discipline', 'company_type']],drop_first=True)
len(encode.columns)
15
hr.head()
| city_development_index | gender | relevent_experience | enrolled_university | education_level | major_discipline | experience | company_size | company_type | last_new_job | training_hours | job_change | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.920 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 20.0 | 50-99 | Other | 1 | 36 | YES |
| 1 | 0.776 | Male | No relevent experience | no_enrollment | Graduate | STEM | 15.0 | 50-99 | Pvt Ltd | >4 | 47 | NO |
| 2 | 0.624 | Male | No relevent experience | Full time course | Graduate | STEM | 5.0 | 50-99 | Other | never | 83 | NO |
| 3 | 0.789 | Male | No relevent experience | no_enrollment | Graduate | Business Degree | 0.0 | 50-99 | Pvt Ltd | never | 52 | YES |
| 4 | 0.767 | Male | Has relevent experience | no_enrollment | Masters | STEM | 20.0 | 50-99 | Funded Startup | 4 | 8 | NO |
hr=pd.concat([hr,encode],axis=1)
hr.head(2)
| city_development_index | gender | relevent_experience | enrolled_university | education_level | major_discipline | experience | company_size | company_type | last_new_job | ... | company_type_Funded Startup | company_type_NGO | company_type_Other | company_type_Public Sector | company_type_Pvt Ltd | last_new_job_2 | last_new_job_3 | last_new_job_4 | last_new_job_>4 | last_new_job_never | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.920 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 20.0 | 50-99 | Other | 1 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0.776 | Male | No relevent experience | no_enrollment | Graduate | STEM | 15.0 | 50-99 | Pvt Ltd | >4 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
2 rows × 43 columns
hr.head(2)
| city_development_index | gender | relevent_experience | enrolled_university | education_level | major_discipline | experience | company_size | company_type | last_new_job | training_hours | job_change | gender_Male | gender_Other | relevent_experience_No relevent experience | enrolled_university_Part time course | enrolled_university_no_enrollment | major_discipline_Business Degree | major_discipline_Humanities | major_discipline_No Major | major_discipline_Other | major_discipline_STEM | company_type_Funded Startup | company_type_NGO | company_type_Other | company_type_Public Sector | company_type_Pvt Ltd | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.920 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 20.0 | 50-99 | Other | 1 | 36 | YES | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 1 | 0.776 | Male | No relevent experience | no_enrollment | Graduate | STEM | 15.0 | 50-99 | Pvt Ltd | >4 | 47 | NO | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
encode=pd.get_dummies(hr[['gender', 'relevent_experience',
'enrolled_university', 'education_level', 'major_discipline', 'company_size', 'company_type', 'last_new_job']],
drop_first=True)
hr.drop(['gender', 'relevent_experience',
'enrolled_university', 'education_level', 'major_discipline', 'company_size', 'company_type', 'last_new_job'],
axis=1,inplace=True)
hr.job_change.replace({"YES":1,"NO":0},inplace=True)
hr=pd.concat([hr,encode],axis=1)
hr.head()
| city_development_index | gender | relevent_experience | enrolled_university | education_level | major_discipline | experience | company_size | company_type | last_new_job | ... | company_type_Funded Startup | company_type_NGO | company_type_Other | company_type_Public Sector | company_type_Pvt Ltd | last_new_job_2 | last_new_job_3 | last_new_job_4 | last_new_job_>4 | last_new_job_never | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.920 | Male | Has relevent experience | no_enrollment | Graduate | STEM | 20.0 | 50-99 | Other | 1 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0.776 | Male | No relevent experience | no_enrollment | Graduate | STEM | 15.0 | 50-99 | Pvt Ltd | >4 | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 2 | 0.624 | Male | No relevent experience | Full time course | Graduate | STEM | 5.0 | 50-99 | Other | never | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 3 | 0.789 | Male | No relevent experience | NaN | Graduate | Business Degree | 0.0 | 50-99 | Pvt Ltd | never | ... | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 4 | 0.767 | Male | Has relevent experience | no_enrollment | Masters | STEM | 20.0 | 50-99 | Funded Startup | 4 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 |
5 rows × 43 columns
hr.shape
(19158, 35)
pd.options.display.max_columns=None
plt.figure(figsize=(30,30))
sns.heatmap(hr.corr(),annot=True)
<AxesSubplot:>
#job change correlation
hr.corr()["job_change"]
city_development_index -0.341665 experience -0.177957 training_hours -0.021577 job_change 1.000000 gender_Male -0.034040 gender_Other 0.016400 relevent_experience_No relevent experience 0.128430 enrolled_university_Part time course 0.001635 enrolled_university_no_enrollment -0.137071 major_discipline_Business Degree 0.004157 major_discipline_Humanities -0.016965 major_discipline_No Major -0.050555 major_discipline_Other 0.006048 major_discipline_STEM 0.049881 company_type_Funded Startup -0.059421 company_type_NGO -0.024412 company_type_Other 0.219113 company_type_Public Sector -0.015593 company_type_Pvt Ltd -0.162408 Name: job_change, dtype: float64
# all libraries
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score,roc_curve,auc,precision_score,recall_score,confusion_matrix,classification_report,f1_score
hr.head(2)
| city_development_index | experience | training_hours | job_change | gender_Male | gender_Other | relevent_experience_No relevent experience | enrolled_university_Part time course | enrolled_university_no_enrollment | major_discipline_Business Degree | major_discipline_Humanities | major_discipline_No Major | major_discipline_Other | major_discipline_STEM | company_type_Funded Startup | company_type_NGO | company_type_Other | company_type_Public Sector | company_type_Pvt Ltd | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.920 | 20.0 | 36 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
| 1 | 0.776 | 15.0 | 47 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
#dataframe for all the data collected
Score=pd.DataFrame(columns=["Name","Train Score","Test Score","Precision Score","Recall Score","F1-Score","ROC - AUC","TN","FP","FN","TP"])
CV_Score=pd.DataFrame(columns=["Name","CV-Train Score","CV-Test Score","CV-Precision Score","CV-Recall Score","CV-F1-Score","CV-ROC - AUC"])
Score.head(1)
| Name | Train Score | Test Score | Precision Score | Recall Score | F1-Score | ROC - AUC | TN | FP | FN | TP |
|---|
X=hr.drop("job_change",axis=1)
y=hr.job_change
lr=LogisticRegression()
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=10)
lr.fit(X_train,y_train)
y_pred=lr.predict(X_test)
proba=lr.predict_proba(X_test)[:,1]
print("trian score : ",lr.score(X_train,y_train))
print("test score : ",lr.score(X_test,y_test))
print()
cm=confusion_matrix(y_test,y_pred)
print("confusion matrix : \n ",confusion_matrix(y_test,y_pred))
print()
print("precision score :",precision_score(y_test,y_pred))
print("recall score : ",recall_score(y_test,y_pred))
print("f1 score : ",f1_score(y_test,y_pred))
print("auc : ",roc_auc_score(y_test,proba))
print()
print("classification report : \n",classification_report(y_test,y_pred))
fpr0,tpr0,thresh=roc_curve(y_test,proba)
trian score : 0.766801513767454
test score : 0.7656576200417536
confusion matrix :
[[2719 184]
[ 714 215]]
precision score : 0.5388471177944862
recall score : 0.23143164693218515
f1 score : 0.3237951807228916
auc : 0.7610737861838482
classification report :
precision recall f1-score support
0 0.79 0.94 0.86 2903
1 0.54 0.23 0.32 929
accuracy 0.77 3832
macro avg 0.67 0.58 0.59 3832
weighted avg 0.73 0.77 0.73 3832
TN=cm[0,0]
FP=cm[0,1]
FN=cm[1,0]
TP=cm[1,1]
Score = Score.append({"Name":"Logistic Regression",
'Train Score':lr.score(X_train,y_train),
'Test Score':lr.score(X_test,y_test),
'Precision Score':precision_score(y_test,y_pred),
'Recall Score':recall_score(y_test,y_pred),
'F1-Score':f1_score(y_test,y_pred),
'ROC - AUC':roc_auc_score(y_test,proba),
'TN':TN,'FP':FP, 'FN':FN, 'TP':TP},ignore_index=True)
Score
| Name | Train Score | Test Score | Precision Score | Recall Score | F1-Score | ROC - AUC | TN | FP | FN | TP | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Logistic Regression | 0.766802 | 0.765658 | 0.538847 | 0.231432 | 0.323795 | 0.761074 | 2719 | 184 | 714 | 215 |
ser=pd.Series(np.exp(lr.coef_).tolist()[0],index=X_train.columns).sort_values(ascending=False)
ser
company_type_Other 3.655468 major_discipline_Business Degree 1.954555 company_type_Public Sector 1.863883 major_discipline_Other 1.859301 major_discipline_STEM 1.839640 major_discipline_Humanities 1.673715 gender_Other 1.296452 relevent_experience_No relevent experience 1.273461 company_type_Pvt Ltd 1.214197 company_type_NGO 1.031106 training_hours 0.999260 experience 0.974609 gender_Male 0.966289 enrolled_university_no_enrollment 0.884839 enrolled_university_Part time course 0.830448 company_type_Funded Startup 0.782123 major_discipline_No Major 0.694204 city_development_index 0.005541 dtype: float64
import seaborn as sns
plt.figure(figsize=(10,10))
sns.barplot(ser.values,ser.index);
from sklearn.tree import DecisionTreeClassifier
X=hr.drop("job_change",axis=1)
y=hr.job_change
dt=DecisionTreeClassifier(random_state=10)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=10)
dt.fit(X_train,y_train)
y_pred=dt.predict(X_test)
proba=dt.predict_proba(X_test)[:,1]
print("trian score : ",dt.score(X_train,y_train))
print("test score : ",dt.score(X_test,y_test))
print()
cm=confusion_matrix(y_test,y_pred)
print("confusion matrix : \n ",confusion_matrix(y_test,y_pred))
print()
print("precision score :",precision_score(y_test,y_pred))
print("recall score : ",recall_score(y_test,y_pred))
print("auc : ",roc_auc_score(y_test,proba))
print()
print("classification report : \n",classification_report(y_test,y_pred))
fpr2,tpr2,thresh=roc_curve(y_test,proba)
trian score : 0.9865587889860368
test score : 0.7168580375782881
confusion matrix :
[[2358 545]
[ 540 389]]
precision score : 0.4164882226980728
recall score : 0.418729817007535
auc : 0.6198099883309904
classification report :
precision recall f1-score support
0 0.81 0.81 0.81 2903
1 0.42 0.42 0.42 929
accuracy 0.72 3832
macro avg 0.62 0.62 0.62 3832
weighted avg 0.72 0.72 0.72 3832
TN=cm[0,0]
FP=cm[0,1]
FN=cm[1,0]
TP=cm[1,1]
Score = Score.append({"Name":"Decision Tree ",
'Train Score':dt.score(X_train,y_train),
'Test Score':dt.score(X_test,y_test),
'Precision Score':precision_score(y_test,y_pred),
'Recall Score':recall_score(y_test,y_pred),
'F1-Score':f1_score(y_test,y_pred),
'ROC - AUC':roc_auc_score(y_test,proba),
'TN':TN,'FP':FP, 'FN':FN, 'TP':TP},ignore_index=True)
Score
| Name | Train Score | Test Score | Precision Score | Recall Score | F1-Score | ROC - AUC | TN | FP | FN | TP | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Logistic Regression | 0.766802 | 0.765658 | 0.538847 | 0.231432 | 0.323795 | 0.761074 | 2719 | 184 | 714 | 215 |
| 1 | Decision Tree | 0.986559 | 0.716858 | 0.416488 | 0.418730 | 0.417606 | 0.619810 | 2358 | 545 | 540 | 389 |
ser=pd.Series(dt.feature_importances_.tolist(),index=X_train.columns).sort_values(ascending=False)
ser
training_hours 0.330308 city_development_index 0.274543 experience 0.148541 company_type_Other 0.064683 gender_Male 0.027189 enrolled_university_no_enrollment 0.026017 relevent_experience_No relevent experience 0.025953 major_discipline_No Major 0.024054 major_discipline_STEM 0.015944 company_type_Pvt Ltd 0.014491 enrolled_university_Part time course 0.009937 major_discipline_Humanities 0.007016 company_type_Funded Startup 0.006648 company_type_NGO 0.006305 major_discipline_Other 0.005372 gender_Other 0.005045 company_type_Public Sector 0.003989 major_discipline_Business Degree 0.003964 dtype: float64
import seaborn as sns
plt.figure(figsize=(10,10))
sns.barplot(ser.values,ser.index);
from sklearn.ensemble import RandomForestClassifier
X=hr.drop("job_change",axis=1)
y=hr.job_change
rf=RandomForestClassifier(random_state=10)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=10)
rf.fit(X_train,y_train)
y_pred=rf.predict(X_test)
proba=rf.predict_proba(X_test)[:,1]
print("trian score : ",rf.score(X_train,y_train))
print("test score : ",rf.score(X_test,y_test))
print()
cm=confusion_matrix(y_test,y_pred)
print("confusion matrix : \n ",confusion_matrix(y_test,y_pred))
print()
print("precision score :",precision_score(y_test,y_pred))
print("recall score : ",recall_score(y_test,y_pred))
print("auc : ",roc_auc_score(y_test,proba))
print()
print("classification report : \n",classification_report(y_test,y_pred))
fpr4,tpr4,thresh=roc_curve(y_test,proba)
trian score : 0.9865587889860368
test score : 0.7669624217118998
confusion matrix :
[[2545 358]
[ 535 394]]
precision score : 0.523936170212766
recall score : 0.4241119483315393
auc : 0.757277742819777
classification report :
precision recall f1-score support
0 0.83 0.88 0.85 2903
1 0.52 0.42 0.47 929
accuracy 0.77 3832
macro avg 0.68 0.65 0.66 3832
weighted avg 0.75 0.77 0.76 3832
TN=cm[0,0]
FP=cm[0,1]
FN=cm[1,0]
TP=cm[1,1]
Score = Score.append({"Name":"Random Forest Classifier ",
'Train Score':rf.score(X_train,y_train),
'Test Score':rf.score(X_test,y_test),
'Precision Score':precision_score(y_test,y_pred),
'Recall Score':recall_score(y_test,y_pred),
'F1-Score':f1_score(y_test,y_pred),
'ROC - AUC':roc_auc_score(y_test,proba),
'TN':TN,'FP':FP, 'FN':FN, 'TP':TP},ignore_index=True)
Score
| Name | Train Score | Test Score | Precision Score | Recall Score | F1-Score | ROC - AUC | TN | FP | FN | TP | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Logistic Regression | 0.766802 | 0.765658 | 0.538847 | 0.231432 | 0.323795 | 0.761074 | 2719 | 184 | 714 | 215 |
| 1 | Decision Tree | 0.986559 | 0.716858 | 0.416488 | 0.418730 | 0.417606 | 0.619810 | 2358 | 545 | 540 | 389 |
| 2 | Random Forest Classifier | 0.986559 | 0.766962 | 0.523936 | 0.424112 | 0.468769 | 0.757278 | 2545 | 358 | 535 | 394 |
ser=pd.Series(rf.feature_importances_.tolist(),index=X_train.columns).sort_values(ascending=False)
ser
training_hours 0.369593 city_development_index 0.286564 experience 0.164966 company_type_Other 0.040367 relevent_experience_No relevent experience 0.019956 enrolled_university_no_enrollment 0.018712 gender_Male 0.017859 company_type_Pvt Ltd 0.017229 major_discipline_No Major 0.013955 major_discipline_STEM 0.012226 enrolled_university_Part time course 0.008244 company_type_Public Sector 0.005140 company_type_Funded Startup 0.005032 gender_Other 0.004940 major_discipline_Humanities 0.004321 major_discipline_Business Degree 0.003698 company_type_NGO 0.003663 major_discipline_Other 0.003534 dtype: float64
import seaborn as sns
plt.figure(figsize=(10,10))
sns.barplot(ser.values,ser.index);
from sklearn.ensemble import AdaBoostClassifier
X=hr.drop("job_change",axis=1)
y=hr.job_change
ad=AdaBoostClassifier(random_state=10)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=10)
ad.fit(X_train,y_train)
y_pred=ad.predict(X_test)
proba=ad.predict_proba(X_test)[:,1]
print("trian score : ",ad.score(X_train,y_train))
print("test score : ",ad.score(X_test,y_test))
print()
cm=confusion_matrix(y_test,y_pred)
print("confusion matrix : \n ",confusion_matrix(y_test,y_pred))
print()
print("precision score :",precision_score(y_test,y_pred))
print("recall score : ",recall_score(y_test,y_pred))
print("auc : ",roc_auc_score(y_test,proba))
print()
print("classification report : \n",classification_report(y_test,y_pred))
fpr6,tpr6,thresh=roc_curve(y_test,proba)
trian score : 0.7765888033407282
test score : 0.7784446764091858
confusion matrix :
[[2712 191]
[ 658 271]]
precision score : 0.5865800865800865
recall score : 0.2917115177610334
auc : 0.7849913622632316
classification report :
precision recall f1-score support
0 0.80 0.93 0.86 2903
1 0.59 0.29 0.39 929
accuracy 0.78 3832
macro avg 0.70 0.61 0.63 3832
weighted avg 0.75 0.78 0.75 3832
TN=cm[0,0]
FP=cm[0,1]
FN=cm[1,0]
TP=cm[1,1]
Score = Score.append({"Name":"AdaBoost Classifier ",
'Train Score':ad.score(X_train,y_train),
'Test Score':ad.score(X_test,y_test),
'Precision Score':precision_score(y_test,y_pred),
'Recall Score':recall_score(y_test,y_pred),
'F1-Score':f1_score(y_test,y_pred),
'ROC - AUC':roc_auc_score(y_test,proba),
'TN':TN,'FP':FP, 'FN':FN, 'TP':TP},ignore_index=True)
Score
| Name | Train Score | Test Score | Precision Score | Recall Score | F1-Score | ROC - AUC | TN | FP | FN | TP | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Logistic Regression | 0.766802 | 0.765658 | 0.538847 | 0.231432 | 0.323795 | 0.761074 | 2719 | 184 | 714 | 215 |
| 1 | Decision Tree | 0.986559 | 0.716858 | 0.416488 | 0.418730 | 0.417606 | 0.619810 | 2358 | 545 | 540 | 389 |
| 2 | Random Forest Classifier | 0.986559 | 0.766962 | 0.523936 | 0.424112 | 0.468769 | 0.757278 | 2545 | 358 | 535 | 394 |
| 3 | AdaBoost Classifier | 0.776589 | 0.778445 | 0.586580 | 0.291712 | 0.389648 | 0.784991 | 2712 | 191 | 658 | 271 |
ser=pd.Series(ad.feature_importances_.tolist(),index=X_train.columns).sort_values(ascending=False)
ser
city_development_index 0.38 training_hours 0.34 experience 0.06 company_type_Other 0.04 major_discipline_No Major 0.04 enrolled_university_no_enrollment 0.04 company_type_Public Sector 0.02 gender_Male 0.02 relevent_experience_No relevent experience 0.02 major_discipline_Business Degree 0.02 company_type_Funded Startup 0.02 major_discipline_Humanities 0.00 enrolled_university_Part time course 0.00 major_discipline_Other 0.00 gender_Other 0.00 major_discipline_STEM 0.00 company_type_NGO 0.00 company_type_Pvt Ltd 0.00 dtype: float64
import seaborn as sns
plt.figure(figsize=(10,10))
sns.barplot(ser.values,ser.index);
from sklearn.ensemble import GradientBoostingClassifier
X=hr.drop("job_change",axis=1)
y=hr.job_change
gb=GradientBoostingClassifier(random_state=10)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=10)
gb.fit(X_train,y_train)
y_pred=gb.predict(X_test)
proba=gb.predict_proba(X_test)[:,1]
print("trian score : ",gb.score(X_train,y_train))
print("test score : ",gb.score(X_test,y_test))
print()
cm=confusion_matrix(y_test,y_pred)
print("confusion matrix : \n ",confusion_matrix(y_test,y_pred))
print()
print("precision score :",precision_score(y_test,y_pred))
print("recall score : ",recall_score(y_test,y_pred))
print("auc : ",roc_auc_score(y_test,proba))
print()
print("classification report : \n",classification_report(y_test,y_pred))
fpr7,tpr7,thresh=roc_curve(y_test,proba)
trian score : 0.796359128278742
test score : 0.7883611691022965
confusion matrix :
[[2576 327]
[ 484 445]]
precision score : 0.5764248704663213
recall score : 0.4790096878363832
auc : 0.7874182344310311
classification report :
precision recall f1-score support
0 0.84 0.89 0.86 2903
1 0.58 0.48 0.52 929
accuracy 0.79 3832
macro avg 0.71 0.68 0.69 3832
weighted avg 0.78 0.79 0.78 3832
TN=cm[0,0]
FP=cm[0,1]
FN=cm[1,0]
TP=cm[1,1]
Score = Score.append({"Name":"Gradiant Boosting Classifier ",
'Train Score':gb.score(X_train,y_train),
'Test Score':gb.score(X_test,y_test),
'Precision Score':precision_score(y_test,y_pred),
'Recall Score':recall_score(y_test,y_pred),
'F1-Score':f1_score(y_test,y_pred),
'ROC - AUC':roc_auc_score(y_test,proba),
'TN':TN,'FP':FP, 'FN':FN, 'TP':TP},ignore_index=True)
Score
| Name | Train Score | Test Score | Precision Score | Recall Score | F1-Score | ROC - AUC | TN | FP | FN | TP | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Logistic Regression | 0.766802 | 0.765658 | 0.538847 | 0.231432 | 0.323795 | 0.761074 | 2719 | 184 | 714 | 215 |
| 1 | Decision Tree | 0.986559 | 0.716858 | 0.416488 | 0.418730 | 0.417606 | 0.619810 | 2358 | 545 | 540 | 389 |
| 2 | Random Forest Classifier | 0.986559 | 0.766962 | 0.523936 | 0.424112 | 0.468769 | 0.757278 | 2545 | 358 | 535 | 394 |
| 3 | AdaBoost Classifier | 0.776589 | 0.778445 | 0.586580 | 0.291712 | 0.389648 | 0.784991 | 2712 | 191 | 658 | 271 |
| 4 | Gradiant Boosting Classifier | 0.796359 | 0.788361 | 0.576425 | 0.479010 | 0.523222 | 0.787418 | 2576 | 327 | 484 | 445 |
ser=pd.Series(gb.feature_importances_.tolist(),index=X_train.columns).sort_values(ascending=False)
ser
city_development_index 0.618834 company_type_Other 0.224642 major_discipline_No Major 0.067555 training_hours 0.022196 experience 0.022056 relevent_experience_No relevent experience 0.016143 enrolled_university_no_enrollment 0.010922 company_type_Public Sector 0.006387 major_discipline_STEM 0.002638 major_discipline_Business Degree 0.001581 company_type_Funded Startup 0.001568 company_type_Pvt Ltd 0.001406 major_discipline_Other 0.001293 gender_Other 0.000979 enrolled_university_Part time course 0.000932 gender_Male 0.000601 company_type_NGO 0.000186 major_discipline_Humanities 0.000083 dtype: float64
import seaborn as sns
plt.figure(figsize=(10,10))
sns.barplot(ser.values,ser.index);
! pip install xgboost
from xgboost import XGBClassifier
Requirement already satisfied: xgboost in c:\users\rawat\anaconda3\lib\site-packages (1.4.2) Requirement already satisfied: numpy in c:\users\rawat\anaconda3\lib\site-packages (from xgboost) (1.19.2) Requirement already satisfied: scipy in c:\users\rawat\anaconda3\lib\site-packages (from xgboost) (1.5.2)
hr.columns
Index(['city_development_index', 'experience', 'training_hours', 'job_change',
'gender_Male', 'gender_Other',
'relevent_experience_No relevent experience',
'enrolled_university_Part time course',
'enrolled_university_no_enrollment', 'major_discipline_Business Degree',
'major_discipline_Humanities', 'major_discipline_No Major',
'major_discipline_Other', 'major_discipline_STEM',
'company_type_Funded Startup', 'company_type_NGO', 'company_type_Other',
'company_type_Public Sector', 'company_type_Pvt Ltd'],
dtype='object')
hr.columns=hr.rename(columns={"last_new_job_>4":"last_new_job_more_than_4",'company_size_<10':'company_size_less_than_10'}).columns
hr.columns
Index(['city_development_index', 'experience', 'training_hours', 'job_change',
'gender_Male', 'gender_Other',
'relevent_experience_No relevent experience',
'enrolled_university_Part time course',
'enrolled_university_no_enrollment', 'major_discipline_Business Degree',
'major_discipline_Humanities', 'major_discipline_No Major',
'major_discipline_Other', 'major_discipline_STEM',
'company_type_Funded Startup', 'company_type_NGO', 'company_type_Other',
'company_type_Public Sector', 'company_type_Pvt Ltd'],
dtype='object')
X=hr.drop("job_change",axis=1)
y=hr.job_change
xg=XGBClassifier(random_state=10)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=10)
xg.fit(X_train,y_train)
y_pred=xg.predict(X_test)
proba=xg.predict_proba(X_test)[:,1]
print("trian score : ",xg.score(X_train,y_train))
print("test score : ",xg.score(X_test,y_test))
print()
cm=confusion_matrix(y_test,y_pred)
print("confusion matrix : \n ",confusion_matrix(y_test,y_pred))
print()
print("precision score :",precision_score(y_test,y_pred))
print("recall score : ",recall_score(y_test,y_pred))
print("auc : ",roc_auc_score(y_test,proba))
print()
print("classification report : \n",classification_report(y_test,y_pred))
fpr8,tpr8,thresh=roc_curve(y_test,proba)
[21:53:51] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
trian score : 0.8618034712253686
test score : 0.7732254697286013
confusion matrix :
[[2534 369]
[ 500 429]]
precision score : 0.5375939849624061
recall score : 0.46178686759956944
auc : 0.7733347745011192
classification report :
precision recall f1-score support
0 0.84 0.87 0.85 2903
1 0.54 0.46 0.50 929
accuracy 0.77 3832
macro avg 0.69 0.67 0.68 3832
weighted avg 0.76 0.77 0.77 3832
TN=cm[0,0]
FP=cm[0,1]
FN=cm[1,0]
TP=cm[1,1]
Score = Score.append({"Name":"XGBoost Classifier ",
'Train Score':xg.score(X_train,y_train),
'Test Score':xg.score(X_test,y_test),
'Precision Score':precision_score(y_test,y_pred),
'Recall Score':recall_score(y_test,y_pred),
'F1-Score':f1_score(y_test,y_pred),
'ROC - AUC':roc_auc_score(y_test,proba),
'TN':TN,'FP':FP, 'FN':FN, 'TP':TP},ignore_index=True)
Score
| Name | Train Score | Test Score | Precision Score | Recall Score | F1-Score | ROC - AUC | TN | FP | FN | TP | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Logistic Regression | 0.766802 | 0.765658 | 0.538847 | 0.231432 | 0.323795 | 0.761074 | 2719 | 184 | 714 | 215 |
| 1 | Decision Tree | 0.986559 | 0.716858 | 0.416488 | 0.418730 | 0.417606 | 0.619810 | 2358 | 545 | 540 | 389 |
| 2 | Random Forest Classifier | 0.986559 | 0.766962 | 0.523936 | 0.424112 | 0.468769 | 0.757278 | 2545 | 358 | 535 | 394 |
| 3 | AdaBoost Classifier | 0.776589 | 0.778445 | 0.586580 | 0.291712 | 0.389648 | 0.784991 | 2712 | 191 | 658 | 271 |
| 4 | Gradiant Boosting Classifier | 0.796359 | 0.788361 | 0.576425 | 0.479010 | 0.523222 | 0.787418 | 2576 | 327 | 484 | 445 |
| 5 | XGBoost Classifier | 0.861803 | 0.773225 | 0.537594 | 0.461787 | 0.496815 | 0.773335 | 2534 | 369 | 500 | 429 |
ser=pd.Series(xg.feature_importances_.tolist(),index=X_train.columns).sort_values(ascending=False)
ser
company_type_Other 0.267534 major_discipline_No Major 0.153330 city_development_index 0.120927 company_type_Public Sector 0.047753 relevent_experience_No relevent experience 0.046874 enrolled_university_Part time course 0.032464 enrolled_university_no_enrollment 0.031898 gender_Male 0.031081 major_discipline_STEM 0.030575 experience 0.030391 major_discipline_Humanities 0.029766 company_type_Pvt Ltd 0.028541 training_hours 0.028363 major_discipline_Business Degree 0.026742 company_type_Funded Startup 0.026562 gender_Other 0.024953 major_discipline_Other 0.021792 company_type_NGO 0.020452 dtype: float64
import seaborn as sns
plt.figure(figsize=(10,10))
sns.barplot(ser.values,ser.index);
from lightgbm import LGBMClassifier
X=hr.drop("job_change",axis=1)
y=hr.job_change
lgbm=LGBMClassifier(random_state=10)
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=10)
lgbm.fit(X_train,y_train)
y_pred=lgbm.predict(X_test)
proba=lgbm.predict_proba(X_test)[:,1]
print("trian score : ",lgbm.score(X_train,y_train))
print("test score : ",lgbm.score(X_test,y_test))
print()
cm=confusion_matrix(y_test,y_pred)
print("confusion matrix : \n ",confusion_matrix(y_test,y_pred))
print()
print("precision score :",precision_score(y_test,y_pred))
print("recall score : ",recall_score(y_test,y_pred))
print("auc : ",roc_auc_score(y_test,proba))
print()
print("classification report : \n",classification_report(y_test,y_pred))
fpr9,tpr9,thresh=roc_curve(y_test,proba)
trian score : 0.821479838183479
test score : 0.7834029227557411
confusion matrix :
[[2511 392]
[ 438 491]]
precision score : 0.5560588901472253
recall score : 0.5285252960172229
auc : 0.7878240727179151
classification report :
precision recall f1-score support
0 0.85 0.86 0.86 2903
1 0.56 0.53 0.54 929
accuracy 0.78 3832
macro avg 0.70 0.70 0.70 3832
weighted avg 0.78 0.78 0.78 3832
TN=cm[0,0]
FP=cm[0,1]
FN=cm[1,0]
TP=cm[1,1]
Score = Score.append({"Name":"Light GBM Classifier ",
'Train Score':lgbm.score(X_train,y_train),
'Test Score':lgbm.score(X_test,y_test),
'Precision Score':precision_score(y_test,y_pred),
'Recall Score':recall_score(y_test,y_pred),
'F1-Score':f1_score(y_test,y_pred),
'ROC - AUC':roc_auc_score(y_test,proba),
'TN':TN,'FP':FP, 'FN':FN, 'TP':TP},ignore_index=True)
Score
| Name | Train Score | Test Score | Precision Score | Recall Score | F1-Score | ROC - AUC | TN | FP | FN | TP | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Logistic Regression | 0.766802 | 0.765658 | 0.538847 | 0.231432 | 0.323795 | 0.761074 | 2719 | 184 | 714 | 215 |
| 1 | Decision Tree | 0.986559 | 0.716858 | 0.416488 | 0.418730 | 0.417606 | 0.619810 | 2358 | 545 | 540 | 389 |
| 2 | Random Forest Classifier | 0.986559 | 0.766962 | 0.523936 | 0.424112 | 0.468769 | 0.757278 | 2545 | 358 | 535 | 394 |
| 3 | AdaBoost Classifier | 0.776589 | 0.778445 | 0.586580 | 0.291712 | 0.389648 | 0.784991 | 2712 | 191 | 658 | 271 |
| 4 | Gradiant Boosting Classifier | 0.796359 | 0.788361 | 0.576425 | 0.479010 | 0.523222 | 0.787418 | 2576 | 327 | 484 | 445 |
| 5 | XGBoost Classifier | 0.861803 | 0.773225 | 0.537594 | 0.461787 | 0.496815 | 0.773335 | 2534 | 369 | 500 | 429 |
| 6 | Light GBM Classifier | 0.821480 | 0.783403 | 0.556059 | 0.528525 | 0.541943 | 0.787824 | 2511 | 392 | 438 | 491 |
ser=pd.Series(lgbm.feature_importances_.tolist(),index=X_train.columns).sort_values(ascending=False)
ser
training_hours 1088 city_development_index 670 experience 589 relevent_experience_No relevent experience 92 company_type_Other 91 enrolled_university_no_enrollment 78 gender_Male 64 major_discipline_STEM 54 major_discipline_No Major 49 company_type_Pvt Ltd 48 company_type_Public Sector 34 company_type_Funded Startup 33 major_discipline_Business Degree 26 enrolled_university_Part time course 24 major_discipline_Humanities 21 gender_Other 17 major_discipline_Other 17 company_type_NGO 5 dtype: int64
import seaborn as sns
plt.figure(figsize=(10,10))
sns.barplot(ser.values,ser.index);
from sklearn.neighbors import KNeighborsClassifier
X=hr.drop("job_change",axis=1)
y=hr.job_change
knn=KNeighborsClassifier()
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=10)
knn.fit(X_train,y_train)
y_pred=knn.predict(X_test)
proba=knn.predict_proba(X_test)[:,1]
print("trian score : ",knn.score(X_train,y_train))
print("test score : ",knn.score(X_test,y_test))
print()
cm=confusion_matrix(y_test,y_pred)
print("confusion matrix : \n ",confusion_matrix(y_test,y_pred))
print()
print("precision score :",precision_score(y_test,y_pred))
print("recall score : ",recall_score(y_test,y_pred))
print("auc : ",roc_auc_score(y_test,proba))
print()
print("classification report : \n",classification_report(y_test,y_pred))
fpr10,tpr10,thresh=roc_curve(y_test,proba)
trian score : 0.8114315542215842
test score : 0.7372129436325678
confusion matrix :
[[2594 309]
[ 698 231]]
precision score : 0.42777777777777776
recall score : 0.24865446716899892
auc : 0.6434911807576662
classification report :
precision recall f1-score support
0 0.79 0.89 0.84 2903
1 0.43 0.25 0.31 929
accuracy 0.74 3832
macro avg 0.61 0.57 0.58 3832
weighted avg 0.70 0.74 0.71 3832
TN=cm[0,0]
FP=cm[0,1]
FN=cm[1,0]
TP=cm[1,1]
Score = Score.append({"Name":"K Nearest Neighbor Classifier ",
'Train Score':knn.score(X_train,y_train),
'Test Score':knn.score(X_test,y_test),
'Precision Score':precision_score(y_test,y_pred),
'Recall Score':recall_score(y_test,y_pred),
'F1-Score':f1_score(y_test,y_pred),
'ROC - AUC':roc_auc_score(y_test,proba),
'TN':TN,'FP':FP, 'FN':FN, 'TP':TP},ignore_index=True)
Score
| Name | Train Score | Test Score | Precision Score | Recall Score | F1-Score | ROC - AUC | TN | FP | FN | TP | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Logistic Regression | 0.766802 | 0.765658 | 0.538847 | 0.231432 | 0.323795 | 0.761074 | 2719 | 184 | 714 | 215 |
| 1 | Decision Tree | 0.986559 | 0.716858 | 0.416488 | 0.418730 | 0.417606 | 0.619810 | 2358 | 545 | 540 | 389 |
| 2 | Random Forest Classifier | 0.986559 | 0.766962 | 0.523936 | 0.424112 | 0.468769 | 0.757278 | 2545 | 358 | 535 | 394 |
| 3 | AdaBoost Classifier | 0.776589 | 0.778445 | 0.586580 | 0.291712 | 0.389648 | 0.784991 | 2712 | 191 | 658 | 271 |
| 4 | Gradiant Boosting Classifier | 0.796359 | 0.788361 | 0.576425 | 0.479010 | 0.523222 | 0.787418 | 2576 | 327 | 484 | 445 |
| 5 | XGBoost Classifier | 0.861803 | 0.773225 | 0.537594 | 0.461787 | 0.496815 | 0.773335 | 2534 | 369 | 500 | 429 |
| 6 | Light GBM Classifier | 0.821480 | 0.783403 | 0.556059 | 0.528525 | 0.541943 | 0.787824 | 2511 | 392 | 438 | 491 |
| 7 | K Nearest Neighbor Classifier | 0.811432 | 0.737213 | 0.427778 | 0.248654 | 0.314500 | 0.643491 | 2594 | 309 | 698 | 231 |
from sklearn.naive_bayes import GaussianNB
X=hr.drop("job_change",axis=1)
y=hr.job_change
nb=GaussianNB()
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=10)
nb.fit(X_train,y_train)
y_pred=nb.predict(X_test)
proba=nb.predict_proba(X_test)[:,1]
print("trian score : ",nb.score(X_train,y_train))
print("test score : ",nb.score(X_test,y_test))
print()
cm=confusion_matrix(y_test,y_pred)
print("confusion matrix : \n ",confusion_matrix(y_test,y_pred))
print()
print("precision score :",precision_score(y_test,y_pred))
print("recall score : ",recall_score(y_test,y_pred))
print("auc : ",roc_auc_score(y_test,proba))
print()
print("classification report : \n",classification_report(y_test,y_pred))
fpr11,tpr11,thresh=roc_curve(y_test,proba)
trian score : 0.7398538431423725
test score : 0.7239039665970772
confusion matrix :
[[2235 668]
[ 390 539]]
precision score : 0.44656172328086163
recall score : 0.5801937567276642
auc : 0.7055801744752375
classification report :
precision recall f1-score support
0 0.85 0.77 0.81 2903
1 0.45 0.58 0.50 929
accuracy 0.72 3832
macro avg 0.65 0.68 0.66 3832
weighted avg 0.75 0.72 0.73 3832
TN=cm[0,0]
FP=cm[0,1]
FN=cm[1,0]
TP=cm[1,1]
Score = Score.append({"Name":"Naive Bayes Classifier ",
'Train Score':nb.score(X_train,y_train),
'Test Score':nb.score(X_test,y_test),
'Precision Score':precision_score(y_test,y_pred),
'Recall Score':recall_score(y_test,y_pred),
'F1-Score':f1_score(y_test,y_pred),
'ROC - AUC':roc_auc_score(y_test,proba),
'TN':TN,'FP':FP, 'FN':FN, 'TP':TP},ignore_index=True)
# this table represents all the scores of the models found above
Score
| Name | Train Score | Test Score | Precision Score | Recall Score | F1-Score | ROC - AUC | TN | FP | FN | TP | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Logistic Regression | 0.766802 | 0.765658 | 0.538847 | 0.231432 | 0.323795 | 0.761074 | 2719 | 184 | 714 | 215 |
| 1 | Decision Tree | 0.986559 | 0.716858 | 0.416488 | 0.418730 | 0.417606 | 0.619810 | 2358 | 545 | 540 | 389 |
| 2 | Random Forest Classifier | 0.986559 | 0.766962 | 0.523936 | 0.424112 | 0.468769 | 0.757278 | 2545 | 358 | 535 | 394 |
| 3 | AdaBoost Classifier | 0.776589 | 0.778445 | 0.586580 | 0.291712 | 0.389648 | 0.784991 | 2712 | 191 | 658 | 271 |
| 4 | Gradiant Boosting Classifier | 0.796359 | 0.788361 | 0.576425 | 0.479010 | 0.523222 | 0.787418 | 2576 | 327 | 484 | 445 |
| 5 | XGBoost Classifier | 0.861803 | 0.773225 | 0.537594 | 0.461787 | 0.496815 | 0.773335 | 2534 | 369 | 500 | 429 |
| 6 | Light GBM Classifier | 0.821480 | 0.783403 | 0.556059 | 0.528525 | 0.541943 | 0.787824 | 2511 | 392 | 438 | 491 |
| 7 | K Nearest Neighbor Classifier | 0.811432 | 0.737213 | 0.427778 | 0.248654 | 0.314500 | 0.643491 | 2594 | 309 | 698 | 231 |
| 8 | Naive Bayes Classifier | 0.739854 | 0.723904 | 0.446562 | 0.580194 | 0.504682 | 0.705580 | 2235 | 668 | 390 | 539 |
plt.figure(figsize=(20,20))
plt.plot(fpr0,tpr0,label="{0} : {1:.3f}".format(Score["Name"][0],auc(fpr0,tpr0)))
plt.plot(fpr2,tpr2,label="{0} : {1:.3f}".format(Score["Name"][1],auc(fpr2,tpr2)))
plt.plot(fpr4,tpr4,label="{0} : {1:.3f}".format(Score["Name"][2],auc(fpr4,tpr4)))
plt.plot(fpr6,tpr6,label="{0} : {1:.3f}".format(Score["Name"][3],auc(fpr6,tpr6)))
plt.plot(fpr7,tpr7,label="{0} : {1:.3f}".format(Score["Name"][4],auc(fpr7,tpr7)))
plt.plot(fpr8,tpr8,label="{0} : {1:.3f}".format(Score["Name"][5],auc(fpr8,tpr8)))
plt.plot(fpr9,tpr9,label="{0} : {1:.3f}".format(Score["Name"][6],auc(fpr9,tpr9)))
plt.plot(fpr10,tpr10,label="{0} : {1:.3f}".format(Score["Name"][7],auc(fpr10,tpr10)))
plt.plot(fpr11,tpr11,label="{0} : {1:.3f}".format(Score["Name"][8],auc(fpr11,tpr11)))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate',size=20)
plt.ylabel('True Positive Rate',size=20)
plt.title('ROc Auc Curves Scores',size=20)
plt.legend(loc="lower right",prop={'size':20})
plt.show()
from imblearn.over_sampling import RandomOverSampler
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=10)
X_train.shape
(15326, 18)
hr1=hr.copy()
os=RandomOverSampler(random_state=10)
X=hr1.drop("job_change",axis=1)
y=hr1["job_change"]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=10)
X_train,y_train=os.fit_resample(X_train,y_train)
X_train.shape
(22956, 34)
# all libraries
import warnings
import matplotlib.pyplot as plt
from sklearn.model_selection import validation_curve,KFold
warnings.filterwarnings("ignore")
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score,roc_curve,auc,precision_score,recall_score,confusion_matrix,classification_report,f1_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier,AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
#dataframe for all the data collected
Score=pd.DataFrame(columns=["Name","Train Score","Test Score","Precision Score","Recall Score","F1-Score","ROC - AUC","TN","FP","FN","TP","FPR","TPR"])
Score.head(1)
| Name | Train Score | Test Score | Precision Score | Recall Score | F1-Score | ROC - AUC | TN | FP | FN | TP | FPR | TPR |
|---|
a=['LogisticRegression()','DecisionTreeClassifier()','RandomForestClassifier()','XGBClassifier()','LGBMClassifier()',
'GaussianNB()','GradientBoostingClassifier()','AdaBoostClassifier()','KNeighborsClassifier()']
li=[LogisticRegression(),DecisionTreeClassifier(random_state=10),RandomForestClassifier(random_state=10),XGBClassifier(random_state=10)
,LGBMClassifier(random_state=10),
GaussianNB(),GradientBoostingClassifier(random_state=10),AdaBoostClassifier(random_state=10),KNeighborsClassifier()]
for i,j in zip(li,a):
pipe= Pipeline([['classifier', i]])
pipe.fit(X_train,y_train)
y_pred=pipe.predict(X_test)
proba=pipe.predict_proba(X_test)[:,1]
print(i)
print()
print()
print("train score : ",pipe.score(X_train,y_train))
print("test score : ",pipe.score(X_test,y_test))
print()
cm=confusion_matrix(y_test,y_pred)
print("confusion matrix : \n ",confusion_matrix(y_test,y_pred))
print()
print("precision score :",precision_score(y_test,y_pred))
print("recall score : ",recall_score(y_test,y_pred))
print("auc : ",roc_auc_score(y_test,proba))
print()
print("classification report : \n",classification_report(y_test,y_pred))
fpr1,tpr1,thresh=roc_curve(y_test,proba)
TN=cm[0,0]
FP=cm[0,1]
FN=cm[1,0]
TP=cm[1,1]
Score = Score.append({"Name":j,
'Train Score':pipe.score(X_train,y_train),
'Test Score':pipe.score(X_test,y_test),
'Precision Score':precision_score(y_test,y_pred),
'Recall Score':recall_score(y_test,y_pred),
'F1-Score':f1_score(y_test,y_pred),
'ROC - AUC':roc_auc_score(y_test,proba),
'TN':TN,'FP':FP, 'FN':FN, 'TP':TP,"fpr":fpr1,"tpr":tpr1},ignore_index=True)
print()
print()
if i=='LogisticRegression()':
ser=pd.Series(np.exp(lr.coef_).tolist()[0],index=X_train.columns).sort_values(ascending=False)
print(ser)
plt.figure(figsize=(10,10))
sns.barplot(ser.values,ser.index);
plt.show()
print()
elif i in['KNeighborsClassifier()','GaussianNB()']:
print()
pass
else:
ser=pd.Series(dt.feature_importances_.tolist(),index=X_train.columns).sort_values(ascending=False)
print(ser)
plt.figure(figsize=(10,10))
sns.barplot(ser.values,ser.index);
plt.show()
print()
LogisticRegression()
train score : 0.7144101759888483
test score : 0.7038100208768268
confusion matrix :
[[2039 864]
[ 271 658]]
precision score : 0.4323258869908016
recall score : 0.7082884822389667
auc : 0.7600959921568831
classification report :
precision recall f1-score support
0 0.88 0.70 0.78 2903
1 0.43 0.71 0.54 929
accuracy 0.70 3832
macro avg 0.66 0.71 0.66 3832
weighted avg 0.77 0.70 0.72 3832
training_hours 0.330308
city_development_index 0.274543
experience 0.148541
company_type_Other 0.064683
gender_Male 0.027189
enrolled_university_no_enrollment 0.026017
relevent_experience_No relevent experience 0.025953
major_discipline_No Major 0.024054
major_discipline_STEM 0.015944
company_type_Pvt Ltd 0.014491
enrolled_university_Part time course 0.009937
major_discipline_Humanities 0.007016
company_type_Funded Startup 0.006648
company_type_NGO 0.006305
major_discipline_Other 0.005372
gender_Other 0.005045
company_type_Public Sector 0.003989
major_discipline_Business Degree 0.003964
dtype: float64
DecisionTreeClassifier(random_state=10)
train score : 0.9888917929952954
test score : 0.7090292275574113
confusion matrix :
[[2320 583]
[ 532 397]]
precision score : 0.4051020408163265
recall score : 0.42734122712594186
auc : 0.6134359726603302
classification report :
precision recall f1-score support
0 0.81 0.80 0.81 2903
1 0.41 0.43 0.42 929
accuracy 0.71 3832
macro avg 0.61 0.61 0.61 3832
weighted avg 0.71 0.71 0.71 3832
training_hours 0.330308
city_development_index 0.274543
experience 0.148541
company_type_Other 0.064683
gender_Male 0.027189
enrolled_university_no_enrollment 0.026017
relevent_experience_No relevent experience 0.025953
major_discipline_No Major 0.024054
major_discipline_STEM 0.015944
company_type_Pvt Ltd 0.014491
enrolled_university_Part time course 0.009937
major_discipline_Humanities 0.007016
company_type_Funded Startup 0.006648
company_type_NGO 0.006305
major_discipline_Other 0.005372
gender_Other 0.005045
company_type_Public Sector 0.003989
major_discipline_Business Degree 0.003964
dtype: float64
RandomForestClassifier(random_state=10)
train score : 0.9888917929952954
test score : 0.7531315240083507
confusion matrix :
[[2420 483]
[ 463 466]]
precision score : 0.4910432033719705
recall score : 0.5016146393972013
auc : 0.7474287947548415
classification report :
precision recall f1-score support
0 0.84 0.83 0.84 2903
1 0.49 0.50 0.50 929
accuracy 0.75 3832
macro avg 0.67 0.67 0.67 3832
weighted avg 0.75 0.75 0.75 3832
training_hours 0.330308
city_development_index 0.274543
experience 0.148541
company_type_Other 0.064683
gender_Male 0.027189
enrolled_university_no_enrollment 0.026017
relevent_experience_No relevent experience 0.025953
major_discipline_No Major 0.024054
major_discipline_STEM 0.015944
company_type_Pvt Ltd 0.014491
enrolled_university_Part time course 0.009937
major_discipline_Humanities 0.007016
company_type_Funded Startup 0.006648
company_type_NGO 0.006305
major_discipline_Other 0.005372
gender_Other 0.005045
company_type_Public Sector 0.003989
major_discipline_Business Degree 0.003964
dtype: float64
[21:54:15] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
importance_type='gain', interaction_constraints='',
learning_rate=0.300000012, max_delta_step=0, max_depth=6,
min_child_weight=1, missing=nan, monotone_constraints='()',
n_estimators=100, n_jobs=16, num_parallel_tree=1, random_state=10,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', validate_parameters=1, verbosity=None)
train score : 0.8507144101759888
test score : 0.7580897703549061
confusion matrix :
[[2294 609]
[ 318 611]]
precision score : 0.5008196721311475
recall score : 0.6576964477933261
auc : 0.7723973974437934
classification report :
precision recall f1-score support
0 0.88 0.79 0.83 2903
1 0.50 0.66 0.57 929
accuracy 0.76 3832
macro avg 0.69 0.72 0.70 3832
weighted avg 0.79 0.76 0.77 3832
training_hours 0.330308
city_development_index 0.274543
experience 0.148541
company_type_Other 0.064683
gender_Male 0.027189
enrolled_university_no_enrollment 0.026017
relevent_experience_No relevent experience 0.025953
major_discipline_No Major 0.024054
major_discipline_STEM 0.015944
company_type_Pvt Ltd 0.014491
enrolled_university_Part time course 0.009937
major_discipline_Humanities 0.007016
company_type_Funded Startup 0.006648
company_type_NGO 0.006305
major_discipline_Other 0.005372
gender_Other 0.005045
company_type_Public Sector 0.003989
major_discipline_Business Degree 0.003964
dtype: float64
LGBMClassifier(random_state=10)
train score : 0.7986147412441191
test score : 0.764874739039666
confusion matrix :
[[2267 636]
[ 265 664]]
precision score : 0.5107692307692308
recall score : 0.7147470398277718
auc : 0.7895290014005036
classification report :
precision recall f1-score support
0 0.90 0.78 0.83 2903
1 0.51 0.71 0.60 929
accuracy 0.76 3832
macro avg 0.70 0.75 0.72 3832
weighted avg 0.80 0.76 0.78 3832
training_hours 0.330308
city_development_index 0.274543
experience 0.148541
company_type_Other 0.064683
gender_Male 0.027189
enrolled_university_no_enrollment 0.026017
relevent_experience_No relevent experience 0.025953
major_discipline_No Major 0.024054
major_discipline_STEM 0.015944
company_type_Pvt Ltd 0.014491
enrolled_university_Part time course 0.009937
major_discipline_Humanities 0.007016
company_type_Funded Startup 0.006648
company_type_NGO 0.006305
major_discipline_Other 0.005372
gender_Other 0.005045
company_type_Public Sector 0.003989
major_discipline_Business Degree 0.003964
dtype: float64
GaussianNB()
train score : 0.6882732183307196
test score : 0.6450939457202505
confusion matrix :
[[1815 1088]
[ 272 657]]
precision score : 0.3765042979942693
recall score : 0.7072120559741658
auc : 0.6997682884006634
classification report :
precision recall f1-score support
0 0.87 0.63 0.73 2903
1 0.38 0.71 0.49 929
accuracy 0.65 3832
macro avg 0.62 0.67 0.61 3832
weighted avg 0.75 0.65 0.67 3832
training_hours 0.330308
city_development_index 0.274543
experience 0.148541
company_type_Other 0.064683
gender_Male 0.027189
enrolled_university_no_enrollment 0.026017
relevent_experience_No relevent experience 0.025953
major_discipline_No Major 0.024054
major_discipline_STEM 0.015944
company_type_Pvt Ltd 0.014491
enrolled_university_Part time course 0.009937
major_discipline_Humanities 0.007016
company_type_Funded Startup 0.006648
company_type_NGO 0.006305
major_discipline_Other 0.005372
gender_Other 0.005045
company_type_Public Sector 0.003989
major_discipline_Business Degree 0.003964
dtype: float64
GradientBoostingClassifier(random_state=10)
train score : 0.7680345007841087
test score : 0.7633089770354906
confusion matrix :
[[2253 650]
[ 257 672]]
precision score : 0.5083207261724659
recall score : 0.7233584499461787
auc : 0.7879738750641018
classification report :
precision recall f1-score support
0 0.90 0.78 0.83 2903
1 0.51 0.72 0.60 929
accuracy 0.76 3832
macro avg 0.70 0.75 0.71 3832
weighted avg 0.80 0.76 0.78 3832
training_hours 0.330308
city_development_index 0.274543
experience 0.148541
company_type_Other 0.064683
gender_Male 0.027189
enrolled_university_no_enrollment 0.026017
relevent_experience_No relevent experience 0.025953
major_discipline_No Major 0.024054
major_discipline_STEM 0.015944
company_type_Pvt Ltd 0.014491
enrolled_university_Part time course 0.009937
major_discipline_Humanities 0.007016
company_type_Funded Startup 0.006648
company_type_NGO 0.006305
major_discipline_Other 0.005372
gender_Other 0.005045
company_type_Public Sector 0.003989
major_discipline_Business Degree 0.003964
dtype: float64
AdaBoostClassifier(random_state=10)
train score : 0.756577801010629
test score : 0.752348643006263
confusion matrix :
[[2211 692]
[ 257 672]]
precision score : 0.49266862170087977
recall score : 0.7233584499461787
auc : 0.7827836316464131
classification report :
precision recall f1-score support
0 0.90 0.76 0.82 2903
1 0.49 0.72 0.59 929
accuracy 0.75 3832
macro avg 0.69 0.74 0.70 3832
weighted avg 0.80 0.75 0.77 3832
training_hours 0.330308
city_development_index 0.274543
experience 0.148541
company_type_Other 0.064683
gender_Male 0.027189
enrolled_university_no_enrollment 0.026017
relevent_experience_No relevent experience 0.025953
major_discipline_No Major 0.024054
major_discipline_STEM 0.015944
company_type_Pvt Ltd 0.014491
enrolled_university_Part time course 0.009937
major_discipline_Humanities 0.007016
company_type_Funded Startup 0.006648
company_type_NGO 0.006305
major_discipline_Other 0.005372
gender_Other 0.005045
company_type_Public Sector 0.003989
major_discipline_Business Degree 0.003964
dtype: float64
KNeighborsClassifier()
train score : 0.8299790904338735
test score : 0.6143006263048016
confusion matrix :
[[1838 1065]
[ 413 516]]
precision score : 0.32637571157495254
recall score : 0.5554359526372443
auc : 0.6296841506522148
classification report :
precision recall f1-score support
0 0.82 0.63 0.71 2903
1 0.33 0.56 0.41 929
accuracy 0.61 3832
macro avg 0.57 0.59 0.56 3832
weighted avg 0.70 0.61 0.64 3832
training_hours 0.330308
city_development_index 0.274543
experience 0.148541
company_type_Other 0.064683
gender_Male 0.027189
enrolled_university_no_enrollment 0.026017
relevent_experience_No relevent experience 0.025953
major_discipline_No Major 0.024054
major_discipline_STEM 0.015944
company_type_Pvt Ltd 0.014491
enrolled_university_Part time course 0.009937
major_discipline_Humanities 0.007016
company_type_Funded Startup 0.006648
company_type_NGO 0.006305
major_discipline_Other 0.005372
gender_Other 0.005045
company_type_Public Sector 0.003989
major_discipline_Business Degree 0.003964
dtype: float64
Score[["Name","Train Score","Test Score","Precision Score","Recall Score","F1-Score","ROC - AUC","TN","FP","FN","TP"]]
| Name | Train Score | Test Score | Precision Score | Recall Score | F1-Score | ROC - AUC | TN | FP | FN | TP | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | LogisticRegression() | 0.714410 | 0.703810 | 0.432326 | 0.708288 | 0.536924 | 0.760096 | 2039 | 864 | 271 | 658 |
| 1 | DecisionTreeClassifier() | 0.988892 | 0.709029 | 0.405102 | 0.427341 | 0.415925 | 0.613436 | 2320 | 583 | 532 | 397 |
| 2 | RandomForestClassifier() | 0.988892 | 0.753132 | 0.491043 | 0.501615 | 0.496273 | 0.747429 | 2420 | 483 | 463 | 466 |
| 3 | XGBClassifier() | 0.850714 | 0.758090 | 0.500820 | 0.657696 | 0.568637 | 0.772397 | 2294 | 609 | 318 | 611 |
| 4 | LGBMClassifier() | 0.798615 | 0.764875 | 0.510769 | 0.714747 | 0.595783 | 0.789529 | 2267 | 636 | 265 | 664 |
| 5 | GaussianNB() | 0.688273 | 0.645094 | 0.376504 | 0.707212 | 0.491399 | 0.699768 | 1815 | 1088 | 272 | 657 |
| 6 | GradientBoostingClassifier() | 0.768035 | 0.763309 | 0.508321 | 0.723358 | 0.597068 | 0.787974 | 2253 | 650 | 257 | 672 |
| 7 | AdaBoostClassifier() | 0.756578 | 0.752349 | 0.492669 | 0.723358 | 0.586132 | 0.782784 | 2211 | 692 | 257 | 672 |
| 8 | KNeighborsClassifier() | 0.829979 | 0.614301 | 0.326376 | 0.555436 | 0.411155 | 0.629684 | 1838 | 1065 | 413 | 516 |
Score.iloc[0]["TP"]
658
plt.figure(figsize=(20,20))
plt.plot(Score.iloc[0]["fpr"],Score.iloc[0]["tpr"],label="{0} : {1:.3f}".format(Score["Name"][0],auc(Score.iloc[0]["fpr"],Score.iloc[0]["tpr"])))
plt.plot(Score.iloc[1]["fpr"],Score.iloc[1]["tpr"],label="{0} : {1:.3f}".format(Score["Name"][1],auc(Score.iloc[1]["fpr"],Score.iloc[1]["tpr"])))
plt.plot(Score.iloc[2]["fpr"],Score.iloc[2]["tpr"],label="{0} : {1:.3f}".format(Score["Name"][2],auc(Score.iloc[2]["fpr"],Score.iloc[2]["tpr"])))
plt.plot(Score.iloc[3]["fpr"],Score.iloc[3]["tpr"],label="{0} : {1:.3f}".format(Score["Name"][3],auc(Score.iloc[3]["fpr"],Score.iloc[3]["tpr"])))
plt.plot(Score.iloc[4]["fpr"],Score.iloc[4]["tpr"],label="{0} : {1:.3f}".format(Score["Name"][4],auc(Score.iloc[4]["fpr"],Score.iloc[4]["tpr"])))
plt.plot(Score.iloc[5]["fpr"],Score.iloc[5]["tpr"],label="{0} : {1:.3f}".format(Score["Name"][5],auc(Score.iloc[5]["fpr"],Score.iloc[5]["tpr"])))
plt.plot(Score.iloc[6]["fpr"],Score.iloc[6]["tpr"],label="{0} : {1:.3f}".format(Score["Name"][6],auc(Score.iloc[6]["fpr"],Score.iloc[6]["tpr"])))
plt.plot(Score.iloc[7]["fpr"],Score.iloc[7]["tpr"],label="{0} : {1:.3f}".format(Score["Name"][7],auc(Score.iloc[7]["fpr"],Score.iloc[7]["tpr"])))
plt.plot(Score.iloc[8]["fpr"],Score.iloc[8]["tpr"],label="{0} : {1:.3f}".format(Score["Name"][8],auc(Score.iloc[8]["fpr"],Score.iloc[8]["tpr"])))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiverrating characteristic example')
plt.legend(loc="lower right",prop={'size':20})
plt.show()
Model Comparision
model=[LogisticRegression(),DecisionTreeClassifier(random_state=10),RandomForestClassifier(random_state=10),AdaBoostClassifier(random_state=10),GradientBoostingClassifier(random_state=10),XGBClassifier(random_state=10),KNeighborsClassifier(),GaussianNB()]
name=["lr","dt","rf","ab","gb","xgb","knn","nb"]
models=list(zip(model,name))
models
##### here models of both train and test have been compared for different metrics for comparision
plt.rcParams['figure.figsize'] = (20, 10)
import matplotlib
matplotlib.rc('xtick', labelsize=20)
matplotlib.rc('ytick', labelsize=20)
# train accuracy after over sampling
results = []
names = []
scoring = 'accuracy'
for model,name in models:
kfold = KFold(n_splits=20)
cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
results.append(cv_results)
names.append(name)
msg = "%s: %f (%f)" % (name, np.mean(cv_results), cv_results.std())
print(msg)
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
lr: 0.698419 (0.018886) dt: 0.877437 (0.076622) rf: 0.896775 (0.066229) ab: 0.745205 (0.019846) gb: 0.756401 (0.018695) [15:28:27] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:28:28] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:28:28] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:28:29] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:28:29] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:28:30] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:28:30] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:28:30] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:28:31] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:28:31] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:28:32] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:28:32] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:28:32] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:28:33] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:28:33] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:28:34] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:28:34] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:28:34] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:28:35] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:28:35] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. xgb: 0.805720 (0.019015) knn: 0.721924 (0.082328) nb: 0.684055 (0.031081)
# Test recall
results = []
names = []
scoring = 'recall'
for model,name in models:
kfold = KFold(n_splits=20)
cv_results = cross_val_score(model, X_test, y_test, cv=kfold, scoring=scoring)
results.append(cv_results)
names.append(name)
msg = "%s: %f (%f)" % (name, np.mean(cv_results), cv_results.std())
print(msg)
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
lr: 0.196902 (0.043316) dt: 0.423750 (0.066451) rf: 0.356588 (0.068970) ab: 0.252928 (0.061462) gb: 0.387444 (0.087898) [15:29:03] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:29:03] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:29:03] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:29:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:29:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:29:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:29:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:29:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:29:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:29:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:29:04] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:29:05] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:29:05] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:29:05] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:29:05] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:29:05] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:29:05] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:29:05] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:29:06] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. [15:29:06] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:1095: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. xgb: 0.383029 (0.071856) knn: 0.170709 (0.053781) nb: 0.493560 (0.074946)
param_range=np.arange(100,120)
train,test=validation_curve(GradientBoostingClassifier(random_state=10),X_train,y_train,param_name="max_depth",
param_range=param_range,scoring="accuracy",n_jobs=-1,cv=10)
train_mean=np.mean(train,axis=1)
test_mean=np.mean(test,axis=1)
plt.plot(param_range,train_mean,label="train")
plt.plot(param_range,test_mean,label="test")
plt.ylim([0,1.1])
plt.legend()
plt.show()
param_range=np.arange(0,1,0.1)
train,test=validation_curve(GradientBoostingClassifier(random_state=10),X_train,y_train,param_name="learning_rate",
param_range=param_range,scoring="accuracy",n_jobs=-1,cv=10)
train_mean=np.mean(train,axis=1)
test_mean=np.mean(test,axis=1)
plt.plot(param_range,train_mean,label="train")
plt.plot(param_range,test_mean,label="test")
plt.ylim([0,1.1])
plt.legend()
plt.show()
param_range=np.arange(1,60)
train,test=validation_curve(GradientBoostingClassifier(random_state=10),X_train,y_train,param_name="min_samples_split",
param_range=param_range,scoring="accuracy",n_jobs=-1,cv=10)
train_mean=np.mean(train,axis=1)
test_mean=np.mean(test,axis=1)
plt.plot(param_range,train_mean,label="train")
plt.plot(param_range,test_mean,label="test")
plt.ylim([0,1.1])
plt.legend()
plt.show()
param_range=np.arange(400,410)
train,test=validation_curve(GradientBoostingClassifier(random_state=10),X_train,y_train,param_name="n_estimators",
param_range=param_range,scoring="accuracy",n_jobs=-1,cv=10)
train_mean=np.mean(train,axis=1)
test_mean=np.mean(test,axis=1)
plt.plot(param_range,train_mean,label="train")
plt.plot(param_range,test_mean,label="test")
plt.ylim([0,1.1])
plt.legend()
plt.show()
from sklearn.model_selection import GridSearchCV
params={'n_estimators':np.arange(400,410),'learning_rate':np.arange(0.1,0.15,0.01),'min_samples_split':np.arange(2,5)}
gs_rf=GridSearchCV(GradientBoostingClassifier(random_state=10),params,cv=5,n_jobs=-1).fit(X_train,y_train)
print(gs_rf.best_params_)
print(gs_rf.best_score_)#accuracy
{'learning_rate': 0.13999999999999999, 'min_samples_split': 3, 'n_estimators': 400}
0.7753092609612657
params={'n_estimators':np.arange(400,410),'learning_rate':np.arange(0.1,0.15,0.01),'min_samples_split':np.arange(2,5)}
gs_rf=GridSearchCV(GradientBoostingClassifier(random_state=10),params,cv=5,n_jobs=-1).fit(X_train,y_train)
print(gs_rf.best_params_)
print(gs_rf.best_score_)#recall
{'learning_rate': 0.13999999999999999, 'min_samples_split': 3, 'n_estimators': 400}
0.7753092609612657
gb=GradientBoostingClassifier(n_estimators=400,learning_rate=0.01,min_samples_split=2,random_state=11)
pipe= Pipeline([['classifier',gb]])
pipe.fit(X_train,y_train)
y_pred=pipe.predict(X_test)
proba=pipe.predict_proba(X_test)[:,1]
print("train score : ",pipe.score(X_train,y_train))
print("test score : ",pipe.score(X_test,y_test))
print()
cm=confusion_matrix(y_test,y_pred)
print("confusion matrix : \n ",confusion_matrix(y_test,y_pred))
print()
print("precision score :",precision_score(y_test,y_pred))
print("recall score : ",recall_score(y_test,y_pred))
print("auc : ",roc_auc_score(y_test,proba))
print()
print("classification report : \n",classification_report(y_test,y_pred))
fpr1,tpr1,thresh=roc_curve(y_test,proba)
train score : 0.7586251960271825
test score : 0.7505219206680585
confusion matrix :
[[2183 720]
[ 236 693]]
precision score : 0.49044585987261147
recall score : 0.7459634015069968
auc : 0.7910288788518023
classification report :
precision recall f1-score support
0 0.90 0.75 0.82 2903
1 0.49 0.75 0.59 929
accuracy 0.75 3832
macro avg 0.70 0.75 0.71 3832
weighted avg 0.80 0.75 0.76 3832
gb=GradientBoostingClassifier(n_estimators=85,learning_rate=0.1,min_samples_split=2,random_state=11)
cross_val_score(rf,X_train,y_train,cv=10,scoring="accuracy")
array([0.77351916, 0.76393728, 0.75958188, 0.77743902, 0.7521777 ,
0.76655052, 0.77342048, 0.75860566, 0.7664488 , 0.76862745])
gb=GradientBoostingClassifier(n_estimators=85,learning_rate=0.1,min_samples_split=2,random_state=11)
cross_val_score(gb,X_train,y_train,cv=10,scoring="recall")
array([0.76916376, 0.72735192, 0.73867596, 0.76045296, 0.7325784 ,
0.73170732, 0.76547515, 0.74542284, 0.74651568, 0.77003484])
y_pred[0:15]
array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], dtype=int64)
y_test.head(15)
14300 0 12154 0 14208 0 16660 0 11854 0 10567 0 17605 0 11908 0 613 0 3492 0 12330 0 12884 0 18354 0 18874 0 869 0 Name: job_change, dtype: int64
!pip install lime
Requirement already satisfied: lime in c:\users\urvashi rawat\anaconda3\lib\site-packages (0.2.0.1) Requirement already satisfied: scikit-learn>=0.18 in c:\users\urvashi rawat\anaconda3\lib\site-packages (from lime) (0.24.1) Requirement already satisfied: scikit-image>=0.12 in c:\users\urvashi rawat\anaconda3\lib\site-packages (from lime) (0.18.1) Requirement already satisfied: matplotlib in c:\users\urvashi rawat\anaconda3\lib\site-packages (from lime) (3.3.4) Requirement already satisfied: numpy in c:\users\urvashi rawat\anaconda3\lib\site-packages (from lime) (1.20.1) Requirement already satisfied: tqdm in c:\users\urvashi rawat\anaconda3\lib\site-packages (from lime) (4.59.0) Requirement already satisfied: scipy in c:\users\urvashi rawat\anaconda3\lib\site-packages (from lime) (1.6.2) Requirement already satisfied: networkx>=2.0 in c:\users\urvashi rawat\anaconda3\lib\site-packages (from scikit-image>=0.12->lime) (2.5) Requirement already satisfied: pillow!=7.1.0,!=7.1.1,>=4.3.0 in c:\users\urvashi rawat\anaconda3\lib\site-packages (from scikit-image>=0.12->lime) (8.2.0) Requirement already satisfied: imageio>=2.3.0 in c:\users\urvashi rawat\anaconda3\lib\site-packages (from scikit-image>=0.12->lime) (2.9.0) Requirement already satisfied: tifffile>=2019.7.26 in c:\users\urvashi rawat\anaconda3\lib\site-packages (from scikit-image>=0.12->lime) (2021.4.8) Requirement already satisfied: PyWavelets>=1.1.1 in c:\users\urvashi rawat\anaconda3\lib\site-packages (from scikit-image>=0.12->lime) (1.1.1) Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\users\urvashi rawat\anaconda3\lib\site-packages (from matplotlib->lime) (2.4.7) Requirement already satisfied: python-dateutil>=2.1 in c:\users\urvashi rawat\anaconda3\lib\site-packages (from matplotlib->lime) (2.8.1) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\urvashi rawat\anaconda3\lib\site-packages (from matplotlib->lime) (1.3.1) Requirement already satisfied: cycler>=0.10 in c:\users\urvashi rawat\anaconda3\lib\site-packages (from matplotlib->lime) (0.10.0) Requirement already satisfied: six in c:\users\urvashi rawat\anaconda3\lib\site-packages (from cycler>=0.10->matplotlib->lime) (1.15.0) Requirement already satisfied: decorator>=4.3.0 in c:\users\urvashi rawat\anaconda3\lib\site-packages (from networkx>=2.0->scikit-image>=0.12->lime) (5.0.6) Requirement already satisfied: joblib>=0.11 in c:\users\urvashi rawat\anaconda3\lib\site-packages (from scikit-learn>=0.18->lime) (1.0.1) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\urvashi rawat\anaconda3\lib\site-packages (from scikit-learn>=0.18->lime) (2.1.0)
import lime
from lime.lime_tabular import LimeTabularExplainer
class_names = [0, 1]
#instantiate the explanations for the data set
limeexplainer = LimeTabularExplainer(X_test.values, class_names=class_names, feature_names = X_test.columns,
discretize_continuous = True)
idx=5 # the rows of the dataset
explainable_exp = limeexplainer.explain_instance(X_test.values[idx], pipe.predict_proba, num_features=3, labels=class_names)
explainable_exp.show_in_notebook(show_table=True, show_all=False)
idx=14
explainable_exp = limeexplainer.explain_instance(X_test.values[idx], pipe.predict_proba, num_features=3, labels=class_names)
explainable_exp.show_in_notebook(show_table=True, show_all=False)
! pip install shap
Requirement already satisfied: shap in c:\users\urvashi rawat\anaconda3\lib\site-packages (0.39.0) Requirement already satisfied: slicer==0.0.7 in c:\users\urvashi rawat\anaconda3\lib\site-packages (from shap) (0.0.7) Requirement already satisfied: numpy in c:\users\urvashi rawat\anaconda3\lib\site-packages (from shap) (1.20.1) Requirement already satisfied: pandas in c:\users\urvashi rawat\anaconda3\lib\site-packages (from shap) (1.2.4) Requirement already satisfied: cloudpickle in c:\users\urvashi rawat\anaconda3\lib\site-packages (from shap) (1.6.0) Requirement already satisfied: scipy in c:\users\urvashi rawat\anaconda3\lib\site-packages (from shap) (1.6.2) Requirement already satisfied: numba in c:\users\urvashi rawat\anaconda3\lib\site-packages (from shap) (0.53.1) Requirement already satisfied: scikit-learn in c:\users\urvashi rawat\anaconda3\lib\site-packages (from shap) (0.24.1) Requirement already satisfied: tqdm>4.25.0 in c:\users\urvashi rawat\anaconda3\lib\site-packages (from shap) (4.59.0) Requirement already satisfied: llvmlite<0.37,>=0.36.0rc1 in c:\users\urvashi rawat\anaconda3\lib\site-packages (from numba->shap) (0.36.0) Requirement already satisfied: setuptools in c:\users\urvashi rawat\anaconda3\lib\site-packages (from numba->shap) (52.0.0.post20210125) Requirement already satisfied: python-dateutil>=2.7.3 in c:\users\urvashi rawat\anaconda3\lib\site-packages (from pandas->shap) (2.8.1) Requirement already satisfied: pytz>=2017.3 in c:\users\urvashi rawat\anaconda3\lib\site-packages (from pandas->shap) (2021.1) Requirement already satisfied: six>=1.5 in c:\users\urvashi rawat\anaconda3\lib\site-packages (from python-dateutil>=2.7.3->pandas->shap) (1.15.0) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\urvashi rawat\anaconda3\lib\site-packages (from scikit-learn->shap) (2.1.0) Requirement already satisfied: joblib>=0.11 in c:\users\urvashi rawat\anaconda3\lib\site-packages (from scikit-learn->shap) (1.0.1)
import shap
# load JS visualization code to notebook
shap.initjs()
explainer = shap.TreeExplainer(model = gb)
shap_values = explainer.shap_values(X_test)
explainer = shap.TreeExplainer(model = gb)
shap_values = explainer.shap_values(X_test)
row=5
shap.force_plot(
base_value=explainer.expected_value,
shap_values=shap_values[row,:],
features=X_test.iloc[row,:],
feature_names=X_test.columns,
link="identity",
out_names="Yes")
shap.decision_plot(
base_value=explainer.expected_value,
shap_values=shap_values[row,:],
features=X_test.iloc[row,:],
feature_names=X_test.columns.tolist(),
link="identity",
new_base_value=0.5)
row=14
shap.force_plot(
base_value=explainer.expected_value,
shap_values=shap_values[row,:],
features=X_test.iloc[row],
feature_names=X_test.columns,
link="identity",
out_names="Yes")
shap.decision_plot(
base_value=explainer.expected_value,
shap_values=shap_values[row,:],
features=X_test.iloc[row,:],
feature_names=X_test.columns.tolist(),
link="identity",
new_base_value=0.5)
row=2
shap.force_plot(
base_value=explainer.expected_value,
shap_values=shap_values[row,:],
features=X_test.iloc[row],
feature_names=X_test.columns,
link="identity",
out_names="Yes")